71 238 4 280 280 243 68 149 197 197 273 386 270 83 81 214 128 214 214 214 262 262 243 207 243 266 396 16 14 66 327 83 85 6 275 238 217 250 12 215 133 57 275 156 83 83 133 156 27 29 238 128 83 90 83 2 327 11 2 2 2 162 239 135 166 194 480 463 171 88 236 17 17 238 238 236 83 1 1 1 327 327 327 327 327 327 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the TCP module. * * Version: @(#)tcp.h 1.0.5 05/23/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _TCP_H #define _TCP_H #define FASTRETRANS_DEBUG 1 #include <linux/list.h> #include <linux/tcp.h> #include <linux/bug.h> #include <linux/slab.h> #include <linux/cache.h> #include <linux/percpu.h> #include <linux/skbuff.h> #include <linux/kref.h> #include <linux/ktime.h> #include <linux/indirect_call_wrapper.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> #include <net/inet_hashtables.h> #include <net/checksum.h> #include <net/request_sock.h> #include <net/sock_reuseport.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ip.h> #include <net/tcp_states.h> #include <net/inet_ecn.h> #include <net/dst.h> #include <net/mptcp.h> #include <linux/seq_file.h> #include <linux/memcontrol.h> #include <linux/bpf-cgroup.h> #include <linux/siphash.h> extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int tcp_orphan_count_sum(void); void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) #define MAX_TCP_OPTION_SPACE 40 #define TCP_MIN_SND_MSS 48 #define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE) /* * Never offer a window over 32767 without using window scaling. Some * poor stacks do signed 16bit maths! */ #define MAX_TCP_WINDOW 32767U /* Minimal accepted MSS. It is (60+60+8) - (20+20). */ #define TCP_MIN_MSS 88U /* The initial MTU to use for probing */ #define TCP_BASE_MSS 1024 /* probing interval, default to 10 minutes as per RFC4821 */ #define TCP_PROBE_INTERVAL 600 /* Specify interval when tcp mtu probing will stop */ #define TCP_PROBE_THRESHOLD 8 /* After receiving this amount of duplicate ACKs fast retransmit starts. */ #define TCP_FASTRETRANS_THRESH 3 /* Maximal number of ACKs sent quickly to accelerate slow-start. */ #define TCP_MAX_QUICKACKS 16U /* Maximal number of window scale according to RFC1323 */ #define TCP_MAX_WSCALE 14U /* urg_data states */ #define TCP_URG_VALID 0x0100 #define TCP_URG_NOTYET 0x0200 #define TCP_URG_READ 0x0400 #define TCP_RETR1 3 /* * This is how many retries it does before it * tries to figure out if the gateway is * down. Minimal RFC value is 3; it corresponds * to ~3sec-8min depending on RTO. */ #define TCP_RETR2 15 /* * This should take at least * 90 minutes to time out. * RFC1122 says that the limit is 100 sec. * 15 is ~13-30min depending on RTO. */ #define TCP_SYN_RETRIES 6 /* This is how many retries are done * when active opening a connection. * RFC1122 says the minimum retry MUST * be at least 180secs. Nevertheless * this value is corresponding to * 63secs of retransmission with the * current initial RTO. */ #define TCP_SYNACK_RETRIES 5 /* This is how may retries are done * when passive opening a connection. * This is corresponding to 31secs of * retransmission with the current * initial RTO. */ #define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ #define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN /* BSD style FIN_WAIT2 deadlock breaker. * It used to be 3min, new value is 60sec, * to combine FIN-WAIT-2 timeout with * TIME-WAIT timer. */ #define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */ #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */ #if HZ >= 100 #define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */ #define TCP_ATO_MIN ((unsigned)(HZ/25)) #else #define TCP_DELACK_MIN 4U #define TCP_ATO_MIN 4U #endif #define TCP_RTO_MAX ((unsigned)(120*HZ)) #define TCP_RTO_MIN ((unsigned)(HZ/5)) #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ #define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */ #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the * initial data transmission if no * valid RTT sample has been acquired, * most likely due to retrans in 3WHS. */ #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes * for local resources. */ #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_INTVL (75*HZ) #define MAX_TCP_KEEPIDLE 32767 #define MAX_TCP_KEEPINTVL 32767 #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 #define TCP_PAWS_24DAYS (60 * 60 * 24 * 24) #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated * after this time. It should be equal * (or greater than) TCP_TIMEWAIT_LEN * to provide reliability equal to one * provided by timewait state. */ #define TCP_PAWS_WINDOW 1 /* Replay window for per-host * timestamps. It must be less than * minimal timewait lifetime. */ /* * TCP option */ #define TCPOPT_NOP 1 /* Padding */ #define TCPOPT_EOL 0 /* End of options */ #define TCPOPT_MSS 2 /* Segment size negotiating */ #define TCPOPT_WINDOW 3 /* Window scaling */ #define TCPOPT_SACK_PERM 4 /* SACK Permitted */ #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 #define TCPOPT_SMC_MAGIC 0xE2D4C3D9 /* * TCP option lengths */ #define TCPOLEN_MSS 4 #define TCPOLEN_WINDOW 3 #define TCPOLEN_SACK_PERM 2 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 /* But this is what stacks really send out. */ #define TCPOLEN_TSTAMP_ALIGNED 12 #define TCPOLEN_WSCALE_ALIGNED 4 #define TCPOLEN_SACKPERM_ALIGNED 4 #define TCPOLEN_SACK_BASE 2 #define TCPOLEN_SACK_BASE_ALIGNED 4 #define TCPOLEN_SACK_PERBLOCK 8 #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_CORK 2 /* Socket is corked */ #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */ /* TCP thin-stream limits */ #define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */ /* TCP initial congestion window as per rfc6928 */ #define TCP_INIT_CWND 10 /* Bit Flags for sysctl_tcp_fastopen */ #define TFO_CLIENT_ENABLE 1 #define TFO_SERVER_ENABLE 2 #define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */ /* Accept SYN data w/o any cookie option */ #define TFO_SERVER_COOKIE_NOT_REQD 0x200 /* Force enable TFO on all listeners, i.e., not requiring the * TCP_FASTOPEN socket option. */ #define TFO_SERVER_WO_SOCKOPT1 0x400 /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ extern atomic_long_t tcp_memory_allocated; DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); extern struct percpu_counter tcp_sockets_allocated; extern unsigned long tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) { if (mem_cgroup_sockets_enabled && sk->sk_memcg && mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; return READ_ONCE(tcp_memory_pressure); } /* * The next routines deal with comparing 32 bit unsigned ints * and worry about wraparound (automatic with unsigned arithmetic). */ static inline bool before(__u32 seq1, __u32 seq2) { return (__s32)(seq1-seq2) < 0; } #define after(seq2, seq1) before(seq1, seq2) /* is s2<=s1<=s3 ? */ static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3) { return seq3 - seq2 >= seq1 - seq2; } static inline bool tcp_out_of_memory(struct sock *sk) { if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) return true; return false; } static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); if (!skb_zcopy_pure(skb)) sk_mem_uncharge(sk, skb->truesize); else sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb))); __kfree_skb(skb); } void sk_forced_mem_schedule(struct sock *sk, int size); bool tcp_check_oom(struct sock *sk, int shift); extern struct proto tcp_prot; #define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define __TCP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) void tcp_tasklet_init(void); int tcp_v4_err(struct sk_buff *skb, u32); void tcp_shutdown(struct sock *sk, int how); int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); void tcp_remove_empty_skb(struct sock *sk); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg); void tcp_splice_eof(struct socket *sock); int tcp_send_mss(struct sock *sk, int *size_goal, int flags); int tcp_wmem_schedule(struct sock *sk, int copy); void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal); void tcp_release_cb(struct sock *sk); void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, int *karg); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); void tcp_twsk_purge(struct list_head *net_exit_list, int family); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule); static inline void tcp_dec_quickack_mode(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ack.quick) { /* How many ACKs S/ACKing new data have we sent? */ const unsigned int pkts = inet_csk_ack_scheduled(sk) ? 1 : 0; if (pkts >= icsk->icsk_ack.quick) { icsk->icsk_ack.quick = 0; /* Leaving quickack mode we deflate ATO. */ icsk->icsk_ack.ato = TCP_ATO_MIN; } else icsk->icsk_ack.quick -= pkts; } } #define TCP_ECN_OK 1 #define TCP_ECN_QUEUE_CWR 2 #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 enum tcp_tw_status { TCP_TW_SUCCESS = 0, TCP_TW_RST = 1, TCP_TW_ACK = 2, TCP_TW_SYN = 3 }; enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race); int tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag); void tcp_clear_retrans(struct tcp_sock *tp); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void __tcp_close(struct sock *sk, long timeout); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int do_tcp_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); bool tcp_bpf_bypass_getsockopt(int level, int optname); int do_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); int tcp_set_window_clamp(struct sock *sk, int val); void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping_internal *tss); void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss); void tcp_data_ready(struct sock *sk); #ifdef CONFIG_MMU int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); #endif void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); /* * BPF SKB-less helpers */ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss); u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct tcphdr *th); /* * TCP v4 functions exported for the inet6 API */ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); void tcp_ld_RTO_revert(struct sock *sk, u32 seq); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb); void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst); struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req); int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int tcp_connect(struct sock *sk); enum tcp_synack_type { TCP_SYNACK_NORMAL, TCP_SYNACK_FASTOPEN, TCP_SYNACK_COOKIE, }; struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size); void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, u32 tsoff); int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, u32 cookie); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. * This counter is used both as a hash input and partially encoded into * the cookie value. A cookie is only validated further if the delta * between the current counter value and the encoded one is less than this, * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if * the counter advances immediately after a cookie is generated). */ #define MAX_SYNCOOKIE_AGE 2 #define TCP_SYNCOOKIE_PERIOD (60 * HZ) #define TCP_SYNCOOKIE_VALID (MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD) /* syncookies: remember time of last synqueue overflow * But do not dirty this field too often (once per second is enough) * It is racy as we do not hold a lock, but race is very minor. */ static inline void tcp_synq_overflow(const struct sock *sk) { unsigned int last_overflow; unsigned int now = jiffies; if (sk->sk_reuseport) { struct sock_reuseport *reuse; reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); if (!time_between32(now, last_overflow, last_overflow + HZ)) WRITE_ONCE(reuse->synq_overflow_ts, now); return; } } last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); if (!time_between32(now, last_overflow, last_overflow + HZ)) WRITE_ONCE(tcp_sk_rw(sk)->rx_opt.ts_recent_stamp, now); } /* syncookies: no recent synqueue overflow on this listening socket? */ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) { unsigned int last_overflow; unsigned int now = jiffies; if (sk->sk_reuseport) { struct sock_reuseport *reuse; reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); return !time_between32(now, last_overflow - HZ, last_overflow + TCP_SYNCOOKIE_VALID); } } last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID, * then we're under synflood. However, we have to use * 'last_overflow - HZ' as lower bound. That's because a concurrent * tcp_synq_overflow() could update .ts_recent_stamp after we read * jiffies but before we store .ts_recent_stamp into last_overflow, * which could lead to rejecting a valid syncookie. */ return !time_between32(now, last_overflow - HZ, last_overflow + TCP_SYNCOOKIE_VALID); } static inline u32 tcp_cookie_time(void) { u64 val = get_jiffies_64(); do_div(val, TCP_SYNCOOKIE_PERIOD); return val; } u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); bool cookie_ecn_ok(const struct tcp_options_received *opt, const struct net *net, const struct dst_entry *dst); /* From net/ipv6/syncookies.c */ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, u32 cookie); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb); void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); void tcp_xmit_retransmit_queue(struct sock *); void tcp_simple_retransmit(struct sock *); void tcp_enter_recovery(struct sock *sk, bool ece_ack); int tcp_trim_head(struct sock *, struct sk_buff *, u32); enum tcp_queue { TCP_FRAG_IN_WRITE_QUEUE, TCP_FRAG_IN_RTX_QUEUE, }; int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp); void tcp_send_probe0(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); void tcp_send_active_reset(struct sock *sk, gfp_t priority); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto); void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb); /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_fin(struct sock *sk); void tcp_check_space(struct sock *sk); void tcp_sack_compress_send_ack(struct sock *sk); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) { if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1) __sock_put(sk); if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1) __sock_put(sk); inet_csk_clear_xmit_timers(sk); } unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); unsigned int tcp_current_mss(struct sock *sk); u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when); /* Bound MSS / TSO packet size with the half of the window */ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) { int cutoff; /* When peer uses tiny windows, there is no use in packetizing * to sub-MSS pieces for the sake of SWS or making sure there * are enough packets in the pipe for fast recovery. * * On the other hand, for extremely large MSS devices, handling * smaller than MSS windows in this way does make sense. */ if (tp->max_window > TCP_MSS_DEFAULT) cutoff = (tp->max_window >> 1); else cutoff = tp->max_window; if (cutoff && pktsize > cutoff) return max_t(int, cutoff, 68U - tp->tcp_header_len); else return pktsize; } /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off); void tcp_read_done(struct sock *sk, size_t len); void tcp_initialize_rcv_mss(struct sock *sk); int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); static inline void tcp_bound_rto(const struct sock *sk) { if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) inet_csk(sk)->icsk_rto = TCP_RTO_MAX; } static inline u32 __tcp_set_rto(const struct tcp_sock *tp) { return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { /* mptcp hooks are only on the slow path */ if (sk_is_mptcp((struct sock *)tp)) return; tp->pred_flags = htonl((tp->tcp_header_len << 26) | ntohl(TCP_FLAG_ACK) | snd_wnd); } static inline void tcp_fast_path_on(struct tcp_sock *tp) { __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); } static inline void tcp_fast_path_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && !tp->urg_data) tcp_fast_path_on(tp); } /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); u32 rto_min = inet_csk(sk)->icsk_rto_min; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); return rto_min; } static inline u32 tcp_rto_min_us(struct sock *sk) { return jiffies_to_usecs(tcp_rto_min(sk)); } static inline bool tcp_ca_dst_locked(const struct dst_entry *dst) { return dst_metric_locked(dst, RTAX_CC_ALGO); } /* Minimum RTT in usec. ~0 means not available. */ static inline u32 tcp_min_rtt(const struct tcp_sock *tp) { return minmax_get(&tp->rtt_min); } /* Compute the actual receive window we are currently advertising. * Rcv_nxt can be after the window if our peer push more data * than the offered window. */ static inline u32 tcp_receive_window(const struct tcp_sock *tp) { s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; if (win < 0) win = 0; return (u32) win; } /* Choose a new window, without checks for shrinking, and without * scaling applied to the result. The caller does these things * if necessary. This is a "raw" window selection. */ u32 __tcp_select_window(struct sock *sk); void tcp_send_window_probe(struct sock *sk); /* TCP uses 32bit jiffies to save some space. * Note that this is different from tcp_time_stamp, which * historically has been the same until linux-4.13. */ #define tcp_jiffies32 ((u32)jiffies) /* * Deliver a 32bit value for TCP timestamp option (RFC 7323) * It is no longer tied to jiffies, but to 1 ms clock. * Note: double check if you want to use tcp_jiffies32 instead of this. */ #define TCP_TS_HZ 1000 static inline u64 tcp_clock_ns(void) { return ktime_get_ns(); } static inline u64 tcp_clock_us(void) { return div_u64(tcp_clock_ns(), NSEC_PER_USEC); } /* This should only be used in contexts where tp->tcp_mstamp is up to date */ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) { return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); } /* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ static inline u32 tcp_ns_to_ts(u64 ns) { return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); } /* Could use tcp_clock_us() / 1000, but this version uses a single divide */ static inline u32 tcp_time_stamp_raw(void) { return tcp_ns_to_ts(tcp_clock_ns()); } void tcp_mstamp_refresh(struct tcp_sock *tp); static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) { return max_t(s64, t1 - t0, 0); } static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { return tcp_ns_to_ts(skb->skb_mstamp_ns); } /* provide the departure time in us unit */ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) { return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC); } #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) #define TCPHDR_FIN 0x01 #define TCPHDR_SYN 0x02 #define TCPHDR_RST 0x04 #define TCPHDR_PSH 0x08 #define TCPHDR_ACK 0x10 #define TCPHDR_URG 0x20 #define TCPHDR_ECE 0x40 #define TCPHDR_CWR 0x80 #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. * We also store the host-order sequence numbers in here too. * This is 44 bytes if IPV6 is enabled. * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately. */ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { /* Note : tcp_tw_isn is used in input path only * (isn chosen by tcp_timewait_state_process()) * * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ __u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; }; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK. */ #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ #define TCPCB_LOST 0x04 /* SKB is lost */ #define TCPCB_TAGBITS 0x07 /* All tag bits */ #define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp_ns) */ #define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ #define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \ TCPCB_REPAIRED) __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ has_rxtstamp:1, /* SKB has a RX timestamp */ unused:5; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { #define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1) /* There is space for up to 24 bytes */ __u32 is_app_limited:1, /* cwnd not fully used? */ delivered_ce:20, unused:11; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ u64 first_tx_mstamp; /* when we reached the "delivered" count */ u64 delivered_mstamp; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, * as TCP moves IP6CB into a different location in skb->cb[] */ static inline int tcp_v6_iif(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->header.h6.iif; } static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif; } /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v6_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags)) return TCP_SKB_CB(skb)->header.h6.iif; #endif return 0; } extern const struct inet_connection_sock_af_ops ipv6_specific; INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); void tcp_v6_early_demux(struct sk_buff *skb); #endif /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v4_sdif(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) return TCP_SKB_CB(skb)->header.h4.iif; #endif return 0; } /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ static inline int tcp_skb_pcount(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->tcp_gso_segs; } static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs) { TCP_SKB_CB(skb)->tcp_gso_segs = segs; } static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs) { TCP_SKB_CB(skb)->tcp_gso_segs += segs; } /* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */ static inline int tcp_skb_mss(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->tcp_gso_size; } static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) { return likely(!TCP_SKB_CB(skb)->eor); } static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return likely(tcp_skb_can_collapse_to(to) && mptcp_skb_can_collapse(to, from) && skb_pure_zcopy_same(to, from)); } /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ CA_EVENT_CWND_RESTART, /* congestion window restart */ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ enum tcp_ca_ack_event_flags { CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ }; /* * Interface for adding new TCP congestion control handlers */ #define TCP_CA_NAME_MAX 16 #define TCP_CA_MAX 128 #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) #define TCP_CA_UNSPEC 0 /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 #define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) union tcp_cc_info; struct ack_sample { u32 pkts_acked; s32 rtt_us; u32 in_flight; }; /* A rate sample measures the number of (original/retransmitted) data * packets delivered "delivered" over an interval of time "interval_us". * The tcp_rate.c code fills in the rate sample, and congestion * control modules that define a cong_control function to run at the end * of ACK processing can optionally chose to consult this sample when * setting cwnd and pacing rate. * A sample is invalid if "delivered" or "interval_us" is negative. */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ s32 delivered_ce; /* number of packets delivered w/ CE marks*/ long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ u32 prior_in_flight; /* in flight before this ACK */ u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ }; struct tcp_congestion_ops { /* fast path fields are put first to fill one cache line */ /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); /* do new cwnd calculation (required) */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); /* call when ack arrives (optional) */ void (*in_ack_event)(struct sock *sk, u32 flags); /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); /* override sysctl_tcp_min_tso_segs */ u32 (*min_tso_segs)(struct sock *sk); /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ void (*cong_control)(struct sock *sk, const struct rate_sample *rs); /* new value of cwnd after loss (required) */ u32 (*undo_cwnd)(struct sock *sk); /* returns the multiplier used in tcp_sndbuf_expand (optional) */ u32 (*sndbuf_expand)(struct sock *sk); /* control/slow paths put last */ /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); char name[TCP_CA_NAME_MAX]; struct module *owner; struct list_head list; u32 key; u32 flags; /* initialize private data (optional) */ void (*init)(struct sock *sk); /* cleanup private data (optional) */ void (*release)(struct sock *sk); } ____cacheline_aligned_in_smp; int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); int tcp_update_congestion_control(struct tcp_congestion_ops *type, struct tcp_congestion_ops *old_type); int tcp_validate_congestion_control(struct tcp_congestion_ops *ca); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk); int tcp_set_default_congestion_control(struct net *net, const char *name); void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); u32 tcp_reno_ssthresh(struct sock *sk); u32 tcp_reno_undo_cwnd(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find(const char *name); struct tcp_congestion_ops *tcp_ca_find_key(u32 key); u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) { return NULL; } #endif static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; } static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->cwnd_event) icsk->icsk_ca_ops->cwnd_event(sk, event); } /* From tcp_cong.c */ void tcp_set_ca_state(struct sock *sk, const u8 ca_state); /* From tcp_rate.c */ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) { return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. * * tcp_is_sack - SACK enabled * tcp_is_reno - No SACK */ static inline int tcp_is_sack(const struct tcp_sock *tp) { return likely(tp->rx_opt.sack_ok); } static inline bool tcp_is_reno(const struct tcp_sock *tp) { return !tcp_is_sack(tp); } static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; } /* This determines how many packets are "in the network" to the best * of our knowledge. In many cases it is conservative, but where * detailed information is available from the receiver (via SACK * blocks etc.) we can make more aggressive calculations. * * Use this for decisions involving congestion control, use just * tp->packets_out to determine if the send queue is empty or not. * * Read this equation as: * * "Packets sent once on transmission queue" MINUS * "Packets left network, but not honestly ACKed yet" PLUS * "Packets fast retransmitted" */ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { return tp->packets_out - tcp_left_out(tp) + tp->retrans_out; } #define TCP_INFINITE_SSTHRESH 0x7fffffff static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp) { return tp->snd_cwnd; } static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val) { WARN_ON_ONCE((int)val <= 0); tp->snd_cwnd = val; } static inline bool tcp_in_slow_start(const struct tcp_sock *tp) { return tcp_snd_cwnd(tp) < tp->snd_ssthresh; } static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) { return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH; } static inline bool tcp_in_cwnd_reduction(const struct sock *sk) { return (TCPF_CA_CWR | TCPF_CA_Recovery) & (1 << inet_csk(sk)->icsk_ca_state); } /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. * The exception is cwnd reduction phase, when cwnd is decreasing towards * ssthresh. */ static inline __u32 tcp_current_ssthresh(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (tcp_in_cwnd_reduction(sk)) return tp->snd_ssthresh; else return max(tp->snd_ssthresh, ((tcp_snd_cwnd(tp) >> 1) + (tcp_snd_cwnd(tp) >> 2))); } /* Use define here intentionally to get WARN_ON location shown at the caller */ #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) void tcp_enter_cwr(struct sock *sk); __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst); /* The maximum number of MSS of available cwnd for which TSO defers * sending if not using sysctl_tcp_tso_win_divisor. */ static inline __u32 tcp_max_tso_deferred_mss(const struct tcp_sock *tp) { return 3; } /* Returns end sequence number of the receiver's advertised window */ static inline u32 tcp_wnd_end(const struct tcp_sock *tp) { return tp->snd_una + tp->snd_wnd; } /* We follow the spirit of RFC2861 to validate cwnd but implement a more * flexible approach. The RFC suggests cwnd should not be raised unless * it was fully used previously. And that's exactly what we do in * congestion avoidance mode. But in slow start we allow cwnd to grow * as long as the application has used half the cwnd. * Example : * cwnd is 10 (IW10), but application sends 9 frames. * We allow cwnd to reach 18 when all frames are ACKed. * This check is safe because it's as aggressive as slow start which already * risks 100% overshoot. The advantage is that we discourage application to * either send more filler packets or data to artificially blow up the cwnd * usage, and allow application-limited process to probe bw more aggressively. */ static inline bool tcp_is_cwnd_limited(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (tp->is_cwnd_limited) return true; /* If in slow start, ensure cwnd grows to twice what was ACKed. */ if (tcp_in_slow_start(tp)) return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out; return false; } /* BBR congestion control needs pacing. * Same remark for SO_MAX_PACING_RATE. * sch_fq packet scheduler is efficiently handling pacing, * but is not always installed/used. * Return true if TCP stack should pace packets itself. */ static inline bool tcp_needs_internal_pacing(const struct sock *sk) { return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; } /* Estimates in how many jiffies next packet for this flow can be sent. * Scheduling a retransmit timer too early would be silly. */ static inline unsigned long tcp_pacing_delay(const struct sock *sk) { s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache; return delay > 0 ? nsecs_to_jiffies(delay) : 0; } static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, const unsigned long max_when) { inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk), max_when); } /* Something is really bad, we could not queue an additional packet, * because qdisc is full or receiver sent a 0 window, or we are paced. * We do not want to add fuel to the fire, or abort too early, * so make sure the timer we arm now is at least 200ms in the future, * regardless of current icsk_rto value (as it could be ~2ms) */ static inline unsigned long tcp_probe0_base(const struct sock *sk) { return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN); } /* Variant of inet_csk_rto_backoff() used for zero window probes */ static inline unsigned long tcp_probe0_when(const struct sock *sk, unsigned long max_when) { u8 backoff = min_t(u8, ilog2(TCP_RTO_MAX / TCP_RTO_MIN) + 1, inet_csk(sk)->icsk_backoff); u64 when = (u64)tcp_probe0_base(sk) << backoff; return (unsigned long)min_t(u64, when, max_when); } static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, tcp_probe0_base(sk), TCP_RTO_MAX); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } /* * Calculate(/check) TCP checksum */ static inline __sum16 tcp_v4_check(int len, __be32 saddr, __be32 daddr, __wsum base) { return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base); } static inline bool tcp_checksum_complete(struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && __skb_checksum_complete(skb); } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); int tcp_abort(struct sock *sk, int err); static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) { rx_opt->dsack = 0; rx_opt->num_sacks = 0; } void tcp_cwnd_restart(struct sock *sk, s32 delta); static inline void tcp_slow_start_after_idle_check(struct sock *sk) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); s32 delta; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) || tp->packets_out || ca_ops->cong_control) return; delta = tcp_jiffies32 - tp->lsndtime; if (delta > inet_csk(sk)->icsk_rto) tcp_cwnd_restart(sk, delta); } /* Determine a window scaling and initial window to offer. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); static inline int __tcp_win_from_space(u8 scaling_ratio, int space) { s64 scaled_space = (s64)space * scaling_ratio; return scaled_space >> TCP_RMEM_TO_WIN_SCALE; } static inline int tcp_win_from_space(const struct sock *sk, int space) { return __tcp_win_from_space(tcp_sk(sk)->scaling_ratio, space); } /* inverse of __tcp_win_from_space() */ static inline int __tcp_space_from_win(u8 scaling_ratio, int win) { u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE; do_div(val, scaling_ratio); return val; } static inline int tcp_space_from_win(const struct sock *sk, int win) { return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); } static inline void tcp_scaling_ratio_init(struct sock *sk) { /* Assume a conservative default of 1200 bytes of payload per 4K page. * This may be adjusted later in tcp_measure_rcv_mss(). */ tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) / SKB_TRUESIZE(4096); } /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - READ_ONCE(sk->sk_backlog.len) - atomic_read(&sk->sk_rmem_alloc)); } static inline int tcp_full_space(const struct sock *sk) { return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } static inline void tcp_adjust_rcv_ssthresh(struct sock *sk) { int unused_mem = sk_unused_reserved_mem(sk); struct tcp_sock *tp = tcp_sk(sk); tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); if (unused_mem) tp->rcv_ssthresh = max_t(u32, tp->rcv_ssthresh, tcp_win_from_space(sk, unused_mem)); } void tcp_cleanup_rbuf(struct sock *sk, int copied); void __tcp_cleanup_rbuf(struct sock *sk, int copied); /* We provision sk_rcvbuf around 200% of sk_rcvlowat. * If 87.5 % (7/8) of the space has been consumed, we want to override * SO_RCVLOWAT constraint, since we are receiving skbs with too small * len/truesize ratio. */ static inline bool tcp_rmem_pressure(const struct sock *sk) { int rcvbuf, threshold; if (tcp_under_memory_pressure(sk)) return true; rcvbuf = READ_ONCE(sk->sk_rcvbuf); threshold = rcvbuf - (rcvbuf >> 3); return atomic_read(&sk->sk_rmem_alloc) > threshold; } static inline bool tcp_epollin_ready(const struct sock *sk, int target) { const struct tcp_sock *tp = tcp_sk(sk); int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq); if (avail <= 0) return false; return (avail >= target) || tcp_rmem_pressure(sk) || (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss); } extern void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst); void tcp_enter_memory_pressure(struct sock *sk); void tcp_leave_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); int val; /* Paired with WRITE_ONCE() in tcp_sock_set_keepintvl() * and do_tcp_setsockopt(). */ val = READ_ONCE(tp->keepalive_intvl); return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl); } static inline int keepalive_time_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); int val; /* Paired with WRITE_ONCE() in tcp_sock_set_keepidle_locked() */ val = READ_ONCE(tp->keepalive_time); return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); } static inline int keepalive_probes(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); int val; /* Paired with WRITE_ONCE() in tcp_sock_set_keepcnt() * and do_tcp_setsockopt(). */ val = READ_ONCE(tp->keepalive_probes); return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes); } static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) { const struct inet_connection_sock *icsk = &tp->inet_conn; return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime, tcp_jiffies32 - tp->rcv_tstamp); } static inline int tcp_fin_time(const struct sock *sk) { int fin_timeout = tcp_sk(sk)->linger2 ? : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fin_timeout); const int rto = inet_csk(sk)->icsk_rto; if (fin_timeout < (rto << 2) - (rto >> 1)) fin_timeout = (rto << 2) - (rto >> 1); return fin_timeout; } static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt, int paws_win) { if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) return true; if (unlikely(!time_before32(ktime_get_seconds(), rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))) return true; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, * then following tcp messages have valid values. Ignore 0 value, * or else 'negative' tsval might forbid us to accept their packets. */ if (!rx_opt->ts_recent) return true; return false; } static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, int rst) { if (tcp_paws_check(rx_opt, 0)) return false; /* RST segments are not recommended to carry timestamp, and, if they do, it is recommended to ignore PAWS because "their cleanup function should take precedence over timestamps." Certainly, it is mistake. It is necessary to understand the reasons of this constraint to relax it: if peer reboots, clock may go out-of-sync and half-open connections will not be reset. Actually, the problem would be not existing if all the implementations followed draft about maintaining clock via reboots. Linux-2.2 DOES NOT! However, we can relax time bounds for RST segments to MSL. */ if (rst && !time_before32(ktime_get_seconds(), rx_opt->ts_recent_stamp + TCP_PAWS_MSL)) return false; return true; } bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time); static inline void tcp_mib_init(struct net *net) { /* See RFC 2012 */ TCP_ADD_STATS(net, TCP_MIB_RTOALGORITHM, 1); TCP_ADD_STATS(net, TCP_MIB_RTOMIN, TCP_RTO_MIN*1000/HZ); TCP_ADD_STATS(net, TCP_MIB_RTOMAX, TCP_RTO_MAX*1000/HZ); TCP_ADD_STATS(net, TCP_MIB_MAXCONN, -1); } /* from STCP */ static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) { tp->lost_skb_hint = NULL; } static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { tcp_clear_retrans_hints_partial(tp); tp->retransmit_skb_hint = NULL; } union tcp_md5_addr { struct in_addr a4; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr a6; #endif }; /* - key database */ struct tcp_md5sig_key { struct hlist_node node; u8 keylen; u8 family; /* AF_INET or AF_INET6 */ u8 prefixlen; u8 flags; union tcp_md5_addr addr; int l3index; /* set if key added with L3 scope */ u8 key[TCP_MD5SIG_MAXKEYLEN]; struct rcu_head rcu; }; /* - sock block */ struct tcp_md5sig_info { struct hlist_head head; struct rcu_head rcu; }; /* - pseudo header */ struct tcp4_pseudohdr { __be32 saddr; __be32 daddr; __u8 pad; __u8 protocol; __be16 len; }; struct tcp6_pseudohdr { struct in6_addr saddr; struct in6_addr daddr; __be32 len; __be32 protocol; /* including padding */ }; union tcp_md5sum_block { struct tcp4_pseudohdr ip4; #if IS_ENABLED(CONFIG_IPV6) struct tcp6_pseudohdr ip6; #endif }; /* - pool: digest algorithm, hash description and scratch buffer */ struct tcp_md5sig_pool { struct ahash_request *md5_req; void *scratch; }; /* - functions */ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen); int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, struct tcp_md5sig_key *key); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); #ifdef CONFIG_TCP_MD5SIG #include <linux/jump_label.h> extern struct static_key_false_deferred tcp_md5_needed; struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family); static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { if (!static_branch_unlikely(&tcp_md5_needed.key)) return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family); } enum skb_drop_reason tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int dif, int sdif); #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) #else static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { return NULL; } static inline enum skb_drop_reason tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int dif, int sdif) { return SKB_NOT_DROPPED_YET; } #define tcp_twsk_md5_key(twsk) NULL #endif bool tcp_alloc_md5sig_pool(void); struct tcp_md5sig_pool *tcp_get_md5sig_pool(void); static inline void tcp_put_md5sig_pool(void) { local_bh_enable(); } int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff *, unsigned int header_len); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key); /* From tcp_fastopen.c */ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost, u16 try_exp); struct tcp_fastopen_request { /* Fast Open cookie. Size 0 means a cookie request */ struct tcp_fastopen_cookie cookie; struct msghdr *data; /* data in MSG_FASTOPEN */ size_t size; int copied; /* queued in tcp_connect() */ struct ubuf_info *uarg; }; void tcp_free_fastopen_req(struct tcp_sock *tp); void tcp_fastopen_destroy_cipher(struct sock *sk); void tcp_fastopen_ctx_destroy(struct net *net); int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void *primary_key, void *backup_key); int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk, u64 *key); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc, const struct dst_entry *dst); void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); #define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t) #define TCP_FASTOPEN_KEY_MAX 2 #define TCP_FASTOPEN_KEY_BUF_LENGTH \ (TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX) /* Fastopen key context */ struct tcp_fastopen_context { siphash_key_t key[TCP_FASTOPEN_KEY_MAX]; int num; struct rcu_head rcu; }; void tcp_fastopen_active_disable(struct sock *sk); bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired); /* Caller needs to wrap with rcu_read_(un)lock() */ static inline struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk) { struct tcp_fastopen_context *ctx; ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); if (!ctx) ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); return ctx; } static inline bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc, const struct tcp_fastopen_cookie *orig) { if (orig->len == TCP_FASTOPEN_COOKIE_SIZE && orig->len == foc->len && !memcmp(orig->val, foc->val, foc->len)) return true; return false; } static inline int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx) { return ctx->num; } /* Latencies incurred by various limits for a sender. They are * chronograph-like stats that are mutually exclusive. */ enum tcp_chrono { TCP_CHRONO_UNSPEC, TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */ TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */ TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */ __TCP_CHRONO_MAX, }; void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type); void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type); /* This helper is needed, because skb->tcp_tsorted_anchor uses * the same memory storage than skb->destructor/_skb_refdst */ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) { skb->destructor = NULL; skb->_skb_refdst = 0UL; } #define tcp_skb_tsorted_save(skb) { \ unsigned long _save = skb->_skb_refdst; \ skb->_skb_refdst = 0UL; #define tcp_skb_tsorted_restore(skb) \ skb->_skb_refdst = _save; \ } void tcp_write_queue_purge(struct sock *sk); static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk) { return skb_rb_first(&sk->tcp_rtx_queue); } static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) { return skb_rb_last(&sk->tcp_rtx_queue); } static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) { return skb_peek_tail(&sk->sk_write_queue); } #define tcp_for_write_queue_from_safe(skb, tmp, sk) \ skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) static inline struct sk_buff *tcp_send_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); } static inline bool tcp_skb_is_last(const struct sock *sk, const struct sk_buff *skb) { return skb_queue_is_last(&sk->sk_write_queue, skb); } /** * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue * @sk: socket * * Since the write queue can have a temporary empty skb in it, * we must not use "return skb_queue_empty(&sk->sk_write_queue)" */ static inline bool tcp_write_queue_empty(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return tp->write_seq == tp->snd_nxt; } static inline bool tcp_rtx_queue_empty(const struct sock *sk) { return RB_EMPTY_ROOT(&sk->tcp_rtx_queue); } static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk) { return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk); } static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { __skb_queue_tail(&sk->sk_write_queue, skb); /* Queue it, remembering where we must start sending. */ if (sk->sk_write_queue.next == skb) tcp_chrono_start(sk, TCP_CHRONO_BUSY); } /* Insert new before skb on the write queue of sk. */ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, struct sock *sk) { __skb_queue_before(&sk->sk_write_queue, skb, new); } static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb); static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk) { tcp_skb_tsorted_anchor_cleanup(skb); rb_erase(&skb->rbnode, &sk->tcp_rtx_queue); } static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk) { list_del(&skb->tcp_tsorted_anchor); tcp_rtx_queue_unlink(skb, sk); tcp_wmem_free_skb(sk, skb); } static inline void tcp_push_pending_frames(struct sock *sk) { if (tcp_send_head(sk)) { struct tcp_sock *tp = tcp_sk(sk); __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle); } } /* Start sequence of the skb just after the highest skb with SACKed * bit, valid only if sacked_out > 0 or when the caller has ensured * validity by itself. */ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) { if (!tp->sacked_out) return tp->snd_una; if (tp->highest_sack == NULL) return tp->snd_nxt; return TCP_SKB_CB(tp->highest_sack)->seq; } static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) { tcp_sk(sk)->highest_sack = skb_rb_next(skb); } static inline struct sk_buff *tcp_highest_sack(struct sock *sk) { return tcp_sk(sk)->highest_sack; } static inline void tcp_highest_sack_reset(struct sock *sk) { tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk); } /* Called when old skb is about to be deleted and replaced by new skb */ static inline void tcp_highest_sack_replace(struct sock *sk, struct sk_buff *old, struct sk_buff *new) { if (old == tcp_highest_sack(sk)) tcp_sk(sk)->highest_sack = new; } /* This helper checks if socket has IP_TRANSPARENT set */ static inline bool inet_sk_transparent(const struct sock *sk) { switch (sk->sk_state) { case TCP_TIME_WAIT: return inet_twsk(sk)->tw_transparent; case TCP_NEW_SYN_RECV: return inet_rsk(inet_reqsk(sk))->no_srccheck; } return inet_test_bit(TRANSPARENT, sk); } /* Determines whether this is a thin stream (which may suffer from * increased latency). Used to trigger latency-reducing mechanisms. */ static inline bool tcp_stream_is_thin(struct tcp_sock *tp) { return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp); } /* /proc */ enum tcp_seq_states { TCP_SEQ_STATE_LISTENING, TCP_SEQ_STATE_ESTABLISHED, }; void *tcp_seq_start(struct seq_file *seq, loff_t *pos); void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos); void tcp_seq_stop(struct seq_file *seq, void *v); struct tcp_seq_afinfo { sa_family_t family; }; struct tcp_iter_state { struct seq_net_private p; enum tcp_seq_states state; struct sock *syn_wait_sk; int bucket, offset, sbucket, num; loff_t last_pos; }; extern struct request_sock_ops tcp_request_sock_ops; extern struct request_sock_ops tcp6_request_sock_ops; void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)); void tcp_gro_complete(struct sk_buff *skb); void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); u32 val; val = READ_ONCE(tp->notsent_lowat); return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } bool tcp_stream_memory_free(const struct sock *sk, int wake); #ifdef CONFIG_PROC_FS int tcp4_proc_init(void); void tcp4_proc_exit(void); #endif int tcp_rtx_synack(const struct sock *sk, struct request_sock *req); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb); /* TCP af-specific functions */ struct tcp_sock_af_ops { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*md5_lookup) (const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash)(char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); int (*md5_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen); #endif }; struct tcp_request_sock_ops { u16 mss_clamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash) (char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); #endif #ifdef CONFIG_SYN_COOKIES __u32 (*cookie_init_seq)(const struct sk_buff *skb, __u16 *mss); #endif struct dst_entry *(*route_req)(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req); u32 (*init_seq)(const struct sk_buff *skb); u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb); }; extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; #if IS_ENABLED(CONFIG_IPV6) extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; #endif #ifdef CONFIG_SYN_COOKIES static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { tcp_synq_overflow(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); return ops->cookie_init_seq(skb, mss); } #else static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { return 0; } #endif int tcpv4_offload_init(void); void tcp_v4_init(void); void tcp_init(void); /* tcp_recovery.c */ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb); void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced); extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd); extern bool tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); /* tcp_plb.c */ /* * Scaling factor for fractions in PLB. For example, tcp_plb_update_state * expects cong_ratio which represents fraction of traffic that experienced * congestion over a single RTT. In order to avoid floating point operations, * this fraction should be mapped to (1 << TCP_PLB_SCALE) and passed in. */ #define TCP_PLB_SCALE 8 /* State for PLB (Protective Load Balancing) for a single TCP connection. */ struct tcp_plb_state { u8 consec_cong_rounds:5, /* consecutive congested rounds */ unused:3; u32 pause_until; /* jiffies32 when PLB can resume rerouting */ }; static inline void tcp_plb_init(const struct sock *sk, struct tcp_plb_state *plb) { plb->consec_cong_rounds = 0; plb->pause_until = 0; } void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, const int cong_ratio); void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { const struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 rto = inet_csk(sk)->icsk_rto; u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; } /* * Save and compile IPv4 options, return a pointer to it */ static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net, struct sk_buff *skb) { const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; if (opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) { kfree(dopt); dopt = NULL; } } return dopt; } /* locally generated TCP pure ACKs have skb->truesize == 2 * (check tcp_send_ack() in net/ipv4/tcp_output.c ) * This is much faster than dissecting the packet to find out. * (Think of GRE encapsulations, IPv4, IPv6, ...) */ static inline bool skb_is_tcp_pure_ack(const struct sk_buff *skb) { return skb->truesize == 2; } static inline void skb_set_tcp_pure_ack(struct sk_buff *skb) { skb->truesize = 2; } static inline int tcp_inq(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int answ; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { answ = 0; } else if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || before(tp->urg_seq, tp->copied_seq) || !before(tp->urg_seq, tp->rcv_nxt)) { answ = tp->rcv_nxt - tp->copied_seq; /* Subtract 1, if FIN was received */ if (answ && sock_flag(sk, SOCK_DONE)) answ--; } else { answ = tp->urg_seq - tp->copied_seq; } return answ; } int tcp_peek_len(struct socket *sock); static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) { u16 segs_in; segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs); /* We update these fields while other threads might * read them from tcp_get_info() */ WRITE_ONCE(tp->segs_in, tp->segs_in + segs_in); if (skb->len > tcp_hdrlen(skb)) WRITE_ONCE(tp->data_segs_in, tp->data_segs_in + segs_in); } /* * TCP listen path runs lockless. * We forced "struct sock" to be const qualified to make sure * we don't modify one of its field by mistake. * Here, we increment sk_drops which is an atomic_t, so we can safely * make sock writable again. */ static inline void tcp_listendrop(const struct sock *sk) { atomic_inc(&((struct sock *)sk)->sk_drops); __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); /* * Interface for adding Upper Level Protocols over TCP */ #define TCP_ULP_NAME_MAX 16 #define TCP_ULP_MAX 128 #define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX) struct tcp_ulp_ops { struct list_head list; /* initialize ulp */ int (*init)(struct sock *sk); /* update ulp */ void (*update)(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); /* cleanup ulp */ void (*release)(struct sock *sk); /* diagnostic */ int (*get_info)(const struct sock *sk, struct sk_buff *skb); size_t (*get_info_size)(const struct sock *sk); /* clone ulp */ void (*clone)(const struct request_sock *req, struct sock *newsk, const gfp_t priority); char name[TCP_ULP_NAME_MAX]; struct module *owner; }; int tcp_register_ulp(struct tcp_ulp_ops *type); void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); void tcp_update_ulp(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); #define MODULE_ALIAS_TCP_ULP(name) \ __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) #ifdef CONFIG_NET_SOCK_MSG struct sk_msg; struct sk_psock; #ifdef CONFIG_BPF_SYSCALL int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ #ifdef CONFIG_INET void tcp_eat_skb(struct sock *sk, struct sk_buff *skb); #else static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) { } #endif int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, struct sk_msg *msg, u32 bytes, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) { } #endif #ifdef CONFIG_CGROUP_BPF static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) { skops->skb = skb; skops->skb_data_end = skb->data + end_offset; } #else static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) { } #endif /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF * program loaded). */ #ifdef CONFIG_BPF static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { struct bpf_sock_ops_kern sock_ops; int ret; memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; sock_owned_by_me(sk); } sock_ops.sk = sk; sock_ops.op = op; if (nargs > 0) memcpy(sock_ops.args, args, nargs * sizeof(*args)); ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); if (ret == 0) ret = sock_ops.reply; else ret = -1; return ret; } static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) { u32 args[2] = {arg1, arg2}; return tcp_call_bpf(sk, op, 2, args); } static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, u32 arg3) { u32 args[3] = {arg1, arg2, arg3}; return tcp_call_bpf(sk, op, 3, args); } #else static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { return -EPERM; } static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) { return -EPERM; } static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, u32 arg3) { return -EPERM; } #endif static inline u32 tcp_timeout_init(struct sock *sk) { int timeout; timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL); if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; return min_t(int, timeout, TCP_RTO_MAX); } static inline u32 tcp_rwnd_init_bpf(struct sock *sk) { int rwnd; rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL); if (rwnd < 0) rwnd = 0; return rwnd; } static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } static inline void tcp_bpf_rtt(struct sock *sk) { if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL); } #if IS_ENABLED(CONFIG_SMC) extern struct static_key_false tcp_have_smc; #endif #if IS_ENABLED(CONFIG_TLS_DEVICE) void clean_acked_data_enable(struct inet_connection_sock *icsk, void (*cad)(struct sock *sk, u32 ack_seq)); void clean_acked_data_disable(struct inet_connection_sock *icsk); void clean_acked_data_flush(void); #endif DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); static inline void tcp_add_tx_delay(struct sk_buff *skb, const struct tcp_sock *tp) { if (static_branch_unlikely(&tcp_tx_delay_enabled)) skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC; } /* Compute Earliest Departure Time for some control packets * like ACK or RST for TIME_WAIT or non ESTABLISHED sockets. */ static inline u64 tcp_transmit_time(const struct sock *sk) { if (static_branch_unlikely(&tcp_tx_delay_enabled)) { u32 delay = (sk->sk_state == TCP_TIME_WAIT) ? tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay; return tcp_clock_ns() + (u64)delay * NSEC_PER_USEC; } return 0; } #endif /* _TCP_H */ |
331 2 535 148 219 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * net busy poll support * Copyright(c) 2013 Intel Corporation. * * Author: Eliezer Tamir * * Contact Information: * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net> */ #ifndef _LINUX_NET_BUSY_POLL_H #define _LINUX_NET_BUSY_POLL_H #include <linux/netdevice.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <net/ip.h> #include <net/xdp.h> /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu * NR_CPUS+1..~0 - Region available for NAPI IDs */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) #define BUSY_POLL_BUDGET 8 #ifdef CONFIG_NET_RX_BUSY_POLL struct napi_struct; extern unsigned int sysctl_net_busy_read __read_mostly; extern unsigned int sysctl_net_busy_poll __read_mostly; static inline bool net_busy_loop_on(void) { return READ_ONCE(sysctl_net_busy_poll); } static inline bool sk_can_busy_loop(const struct sock *sk) { return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current); } bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { return 0; } static inline bool sk_can_busy_loop(struct sock *sk) { return false; } #endif /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long busy_loop_current_time(void) { #ifdef CONFIG_NET_RX_BUSY_POLL return (unsigned long)(local_clock() >> 10); #else return 0; #endif } /* in poll/select we use the global sysctl_net_ll_poll value */ static inline bool busy_loop_timeout(unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sysctl_net_busy_poll); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline bool sk_busy_loop_timeout(struct sock *sk, unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sk->sk_ll_usec); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline void sk_busy_loop(struct sock *sk, int nonblock) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id >= MIN_NAPI_ID) napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, READ_ONCE(sk->sk_prefer_busy_poll), READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); #endif } /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, struct napi_struct *napi) { #ifdef CONFIG_NET_RX_BUSY_POLL /* If the skb was already marked with a valid NAPI ID, avoid overwriting * it. */ if (skb->napi_id < MIN_NAPI_ID) skb->napi_id = napi->napi_id; #endif } /* used in the protocol hanlder to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id)) WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_update(sk, skb); } /* Variant of sk_mark_napi_id() for passive flow setup, * as sk->sk_napi_id and sk->sk_rx_queue_mapping content * needs to be set. */ static inline void sk_mark_napi_id_set(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_set(sk, skb); } static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) { #ifdef CONFIG_NET_RX_BUSY_POLL if (!READ_ONCE(sk->sk_napi_id)) WRITE_ONCE(sk->sk_napi_id, napi_id); #endif } /* variant used for unconnected sockets */ static inline void sk_mark_napi_id_once(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL __sk_mark_napi_id_once(sk, skb->napi_id); #endif } static inline void sk_mark_napi_id_once_xdp(struct sock *sk, const struct xdp_buff *xdp) { #ifdef CONFIG_NET_RX_BUSY_POLL __sk_mark_napi_id_once(sk, xdp->rxq->napi_id); #endif } #endif /* _LINUX_NET_BUSY_POLL_H */ |
2270 2270 2270 57 57 57 57 57 57 57 57 57 57 57 57 57 57 1 1 57 57 57 1 57 57 115 115 115 115 115 115 115 112 112 112 112 112 115 115 109 109 115 115 115 115 13 13 5 13 13 13 13 2111 2111 2111 2111 2111 2245 2245 2245 2220 2220 2245 2245 2245 2245 2245 2245 30 2220 2245 2245 2245 2245 2245 2220 2220 2220 2245 2220 30 30 2220 115 115 2245 2245 2245 2245 2245 2245 2245 2245 2245 2245 2245 2220 2245 2245 316 316 2245 2245 2245 2245 2245 2245 2244 2245 2245 2245 2245 2245 115 115 2245 2245 2245 2245 115 2245 2245 2245 115 2245 2245 2244 2245 2245 2245 2245 2245 2245 2245 2245 2245 2245 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/ialloc.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * BSD ufs-inspired inode and directory allocation by * Stephen Tweedie (sct@redhat.com), 1993 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/time.h> #include <linux/fs.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/random.h> #include <linux/bitops.h> #include <linux/blkdev.h> #include <linux/cred.h> #include <asm/byteorder.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include <trace/events/ext4.h> /* * ialloc.c contains the inodes allocation and deallocation routines */ /* * The free inodes are managed by bitmaps. A file system contains several * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap * block for inodes, N blocks for the inode table and data blocks. * * The file system contains group descriptors which are located after the * super block. Each descriptor contains the number of the bitmap block and * the free blocks count in the block. */ /* * To avoid calling the atomic setbit hundreds or thousands of times, we only * need to use it within a single byte (to ensure we get endianness right). * We can use memset for the rest of the bitmap as there are no other users. */ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) { int i; if (start_bit >= end_bit) return; ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) ext4_set_bit(i, bitmap); if (i < end_bit) memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); } void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) { if (uptodate) { set_buffer_uptodate(bh); set_bitmap_uptodate(bh); } unlock_buffer(bh); put_bh(bh); } static int ext4_validate_inode_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return 0; grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return 0; if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; blk = ext4_inode_bitmap(sb, desc); if (!ext4_inode_bitmap_csum_verify(sb, desc, bh, EXT4_INODES_PER_GROUP(sb) / 8) || ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); return 0; } /* * Read the inode allocation bitmap for a given block_group, reading * into the specified slot in the superblock's bitmap cache. * * Return buffer_head of bitmap on success, or an ERR_PTR on error. */ static struct buffer_head * ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; int err; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_inode_bitmap(sb, desc); if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid inode bitmap blk %llu in " "block_group %u", bitmap_blk, block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_warning(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return ERR_PTR(-ENOMEM); } if (bitmap_uptodate(bh)) goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); goto verify; } ext4_lock_group(sb, block_group); if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { if (block_group == 0) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Inode bitmap for bg 0 marked " "uninitialized"); err = -EFSCORRUPTED; goto out; } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, * bitmap is also uptodate */ set_bitmap_uptodate(bh); unlock_buffer(bh); goto verify; } /* * submit the buffer_head for reading */ trace_ext4_load_inode_bitmap(sb, block_group); ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read); ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); if (!buffer_uptodate(bh)) { put_bh(bh); ext4_error_err(sb, EIO, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EIO); } verify: err = ext4_validate_inode_bitmap(sb, desc, block_group, bh); if (err) goto out; return bh; out: put_bh(bh); return ERR_PTR(err); } /* * NOTE! When we get the inode, we're the only people * that have access to it, and as such there are no * race conditions we have to worry about. The inode * is not on the hash-lists, and it cannot be reached * through the filesystem because the directory entry * has been deleted earlier. * * HOWEVER: we must make sure that we get no aliases, * which means that we have to call "clear_inode()" * _before_ we mark the inode not in use in the inode * bitmaps. Otherwise a newly created file might use * the same inode number (not actually the same pointer * though), and then we'd have two inodes sharing the * same inode number and space on the harddisk. */ void ext4_free_inode(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; int is_directory; unsigned long ino; struct buffer_head *bitmap_bh = NULL; struct buffer_head *bh2; ext4_group_t block_group; unsigned long bit; struct ext4_group_desc *gdp; struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; struct ext4_group_info *grp; if (!sb) { printk(KERN_ERR "EXT4-fs: %s:%d: inode on " "nonexistent device\n", __func__, __LINE__); return; } if (atomic_read(&inode->i_count) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", __func__, __LINE__, inode->i_ino, atomic_read(&inode->i_count)); return; } if (inode->i_nlink) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", __func__, __LINE__, inode->i_ino, inode->i_nlink); return; } sbi = EXT4_SB(sb); ino = inode->i_ino; ext4_debug("freeing inode %lu\n", ino); trace_ext4_free_inode(inode); dquot_initialize(inode); dquot_free_inode(inode); is_directory = S_ISDIR(inode->i_mode); /* Do this BEFORE marking the inode not in use or returning an error */ ext4_clear_inode(inode); es = sbi->s_es; if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { ext4_error(sb, "reserved or nonexistent inode %lu", ino); goto error_return; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); /* Don't bother if the inode bitmap is corrupt. */ if (IS_ERR(bitmap_bh)) { fatal = PTR_ERR(bitmap_bh); bitmap_bh = NULL; goto error_return; } if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, block_group); if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { fatal = -EFSCORRUPTED; goto error_return; } } BUFFER_TRACE(bitmap_bh, "get_write_access"); fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh, EXT4_JTR_NONE); if (fatal) goto error_return; fatal = -ESRCH; gdp = ext4_get_group_desc(sb, block_group, &bh2); if (gdp) { BUFFER_TRACE(bh2, "get_write_access"); fatal = ext4_journal_get_write_access(handle, sb, bh2, EXT4_JTR_NONE); } ext4_lock_group(sb, block_group); cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); if (fatal || !cleared) { ext4_unlock_group(sb, block_group); goto out; } count = ext4_free_inodes_count(sb, gdp) + 1; ext4_free_inodes_set(sb, gdp, count); if (is_directory) { count = ext4_used_dirs_count(sb, gdp) - 1; ext4_used_dirs_set(sb, gdp, count); if (percpu_counter_initialized(&sbi->s_dirs_counter)) percpu_counter_dec(&sbi->s_dirs_counter); } ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) percpu_counter_inc(&sbi->s_freeinodes_counter); if (sbi->s_log_groups_per_flex) { struct flex_groups *fg; fg = sbi_array_rcu_deref(sbi, s_flex_groups, ext4_flex_group(sbi, block_group)); atomic_inc(&fg->free_inodes); if (is_directory) atomic_dec(&fg->used_dirs); } BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); out: if (cleared) { BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); } error_return: brelse(bitmap_bh); ext4_std_error(sb, fatal); } struct orlov_stats { __u64 free_clusters; __u32 free_inodes; __u32 used_dirs; }; /* * Helper function for Orlov's allocator; returns critical information * for a particular block group or flex_bg. If flex_size is 1, then g * is a block group number; otherwise it is flex_bg number. */ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, int flex_size, struct orlov_stats *stats) { struct ext4_group_desc *desc; if (flex_size > 1) { struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb), s_flex_groups, g); stats->free_inodes = atomic_read(&fg->free_inodes); stats->free_clusters = atomic64_read(&fg->free_clusters); stats->used_dirs = atomic_read(&fg->used_dirs); return; } desc = ext4_get_group_desc(sb, g, NULL); if (desc) { stats->free_inodes = ext4_free_inodes_count(sb, desc); stats->free_clusters = ext4_free_group_clusters(sb, desc); stats->used_dirs = ext4_used_dirs_count(sb, desc); } else { stats->free_inodes = 0; stats->free_clusters = 0; stats->used_dirs = 0; } } /* * Orlov's allocator for directories. * * We always try to spread first-level directories. * * If there are blockgroups with both free inodes and free clusters counts * not worse than average we return one with smallest directory count. * Otherwise we simply return a random group. * * For the rest rules look so: * * It's OK to put directory into a group unless * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or * it has too few free clusters left (min_clusters) or * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). */ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4_group_t *group, umode_t mode, const struct qstr *qstr) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei, grp_free; ext4_fsblk_t freec, avefreec; unsigned int ndirs; int max_dirs, min_inodes; ext4_grpblk_t min_clusters; ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; int flex_size = ext4_flex_bg_size(sbi); struct dx_hash_info hinfo; ngroups = real_ngroups; if (flex_size > 1) { ngroups = (real_ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; parent_group >>= sbi->s_log_groups_per_flex; } freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter); avefreec = freec; do_div(avefreec, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); if (S_ISDIR(mode) && ((parent == d_inode(sb->s_root)) || (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { int best_ndir = inodes_per_group; int ret = -1; if (qstr) { hinfo.hash_version = DX_HASH_HALF_MD4; hinfo.seed = sbi->s_hash_seed; ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); parent_group = hinfo.hash % ngroups; } else parent_group = get_random_u32_below(ngroups); for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; get_orlov_stats(sb, g, flex_size, &stats); if (!stats.free_inodes) continue; if (stats.used_dirs >= best_ndir) continue; if (stats.free_inodes < avefreei) continue; if (stats.free_clusters < avefreec) continue; grp = g; ret = 0; best_ndir = stats.used_dirs; } if (ret) goto fallback; found_flex_bg: if (flex_size == 1) { *group = grp; return 0; } /* * We pack inodes at the beginning of the flexgroup's * inode tables. Block allocation decisions will do * something similar, although regular files will * start at 2nd block group of the flexgroup. See * ext4_ext_find_goal() and ext4_find_near(). */ grp *= flex_size; for (i = 0; i < flex_size; i++) { if (grp+i >= real_ngroups) break; desc = ext4_get_group_desc(sb, grp+i, NULL); if (desc && ext4_free_inodes_count(sb, desc)) { *group = grp+i; return 0; } } goto fallback; } max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; min_inodes = avefreei - inodes_per_group*flex_size / 4; if (min_inodes < 1) min_inodes = 1; min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; /* * Start looking in the flex group where we last allocated an * inode for this parent directory */ if (EXT4_I(parent)->i_last_alloc_group != ~0) { parent_group = EXT4_I(parent)->i_last_alloc_group; if (flex_size > 1) parent_group >>= sbi->s_log_groups_per_flex; } for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; get_orlov_stats(sb, grp, flex_size, &stats); if (stats.used_dirs >= max_dirs) continue; if (stats.free_inodes < min_inodes) continue; if (stats.free_clusters < min_clusters) continue; goto found_flex_bg; } fallback: ngroups = real_ngroups; avefreei = freei / ngroups; fallback_retry: parent_group = EXT4_I(parent)->i_block_group; for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); if (desc) { grp_free = ext4_free_inodes_count(sb, desc); if (grp_free && grp_free >= avefreei) { *group = grp; return 0; } } } if (avefreei) { /* * The free-inodes counter is approximate, and for really small * filesystems the above test can fail to find any blockgroups */ avefreei = 0; goto fallback_retry; } return -1; } static int find_group_other(struct super_block *sb, struct inode *parent, ext4_group_t *group, umode_t mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); /* * Try to place the inode is the same flex group as its * parent. If we can't find space, use the Orlov algorithm to * find another flex group, and store that information in the * parent directory's inode information so that use that flex * group for future allocations. */ if (flex_size > 1) { int retry = 0; try_again: parent_group &= ~(flex_size-1); last = parent_group + flex_size; if (last > ngroups) last = ngroups; for (i = parent_group; i < last; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc && ext4_free_inodes_count(sb, desc)) { *group = i; return 0; } } if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { retry = 1; parent_group = EXT4_I(parent)->i_last_alloc_group; goto try_again; } /* * If this didn't work, use the Orlov search algorithm * to find a new flex group; we pass in the mode to * avoid the topdir algorithms. */ *group = parent_group + flex_size; if (*group > ngroups) *group = 0; return find_group_orlov(sb, parent, group, mode, NULL); } /* * Try to place the inode in its parent directory */ *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && ext4_free_group_clusters(sb, desc)) return 0; /* * We're going to place this inode in a different blockgroup from its * parent. We want to cause files in a common directory to all land in * the same blockgroup. But we want files which are in a different * directory which shares a blockgroup with our parent to land in a * different blockgroup. * * So add our directory's i_ino into the starting point for the hash. */ *group = (*group + parent->i_ino) % ngroups; /* * Use a quadratic hash to find a group with a free inode and some free * blocks. */ for (i = 1; i < ngroups; i <<= 1) { *group += i; if (*group >= ngroups) *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && ext4_free_group_clusters(sb, desc)) return 0; } /* * That failed: try linear search for a free inode, even if that group * has no free blocks. */ *group = parent_group; for (i = 0; i < ngroups; i++) { if (++*group >= ngroups) *group = 0; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc)) return 0; } return -1; } /* * In no journal mode, if an inode has recently been deleted, we want * to avoid reusing it until we're reasonably sure the inode table * block has been written back to disk. (Yes, these values are * somewhat arbitrary...) */ #define RECENTCY_MIN 60 #define RECENTCY_DIRTY 300 static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) { struct ext4_group_desc *gdp; struct ext4_inode *raw_inode; struct buffer_head *bh; int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int offset, ret = 0; int recentcy = RECENTCY_MIN; u32 dtime, now; gdp = ext4_get_group_desc(sb, group, NULL); if (unlikely(!gdp)) return 0; bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) + (ino / inodes_per_block)); if (!bh || !buffer_uptodate(bh)) /* * If the block is not in the buffer cache, then it * must have been written out. */ goto out; offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); raw_inode = (struct ext4_inode *) (bh->b_data + offset); /* i_dtime is only 32 bits on disk, but we only care about relative * times in the range of a few minutes (i.e. long enough to sync a * recently-deleted inode to disk), so using the low 32 bits of the * clock (a 68 year range) is enough, see time_before32() */ dtime = le32_to_cpu(raw_inode->i_dtime); now = ktime_get_real_seconds(); if (buffer_dirty(bh)) recentcy += RECENTCY_DIRTY; if (dtime && time_before32(dtime, now) && time_before32(now, dtime + recentcy)) ret = 1; out: brelse(bh); return ret; } static int find_inode_bit(struct super_block *sb, ext4_group_t group, struct buffer_head *bitmap, unsigned long *ino) { bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL; unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb); next: *ino = ext4_find_next_zero_bit((unsigned long *) bitmap->b_data, EXT4_INODES_PER_GROUP(sb), *ino); if (*ino >= EXT4_INODES_PER_GROUP(sb)) goto not_found; if (check_recently_deleted && recently_deleted(sb, group, *ino)) { recently_deleted_ino = *ino; *ino = *ino + 1; if (*ino < EXT4_INODES_PER_GROUP(sb)) goto next; goto not_found; } return 1; not_found: if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb)) return 0; /* * Not reusing recently deleted inodes is mostly a preference. We don't * want to report ENOSPC or skew allocation patterns because of that. * So return even recently deleted inode if we could find better in the * given range. */ *ino = recently_deleted_ino; return 1; } int ext4_mark_inode_used(struct super_block *sb, int ino) { unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL; struct ext4_group_desc *gdp; ext4_group_t group; int bit; int err = -EFSCORRUPTED; if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) goto out; group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); if (IS_ERR(inode_bitmap_bh)) return PTR_ERR(inode_bitmap_bh); if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) { err = 0; goto out; } gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !group_desc_bh) { err = -EINVAL; goto out; } ext4_set_bit(bit, inode_bitmap_bh->b_data); BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } err = sync_dirty_buffer(inode_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { struct buffer_head *block_bitmap_bh; block_bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(block_bitmap_bh)) { err = PTR_ERR(block_bitmap_bh); goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh); sync_dirty_buffer(block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); brelse(block_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } } /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; ext4_lock_group(sb, group); /* while we modify the bg desc */ free = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); free = 0; } /* * Check the relative inode number against the last used * relative inode number in this group. if it is greater * we need to update the bg_itable_unused count */ if (bit >= free) ext4_itable_unused_set(sb, gdp, (EXT4_INODES_PER_GROUP(sb) - bit - 1)); } else { ext4_lock_group(sb, group); } ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (ext4_has_group_desc_csum(sb)) { ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh); sync_dirty_buffer(group_desc_bh); out: return err; } static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, bool encrypt) { struct super_block *sb = dir->i_sb; int nblocks = 0; #ifdef CONFIG_EXT4_FS_POSIX_ACL struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(p)) return PTR_ERR(p); if (p) { int acl_size = p->a_count * sizeof(ext4_acl_entry); nblocks += (S_ISDIR(mode) ? 2 : 1) * __ext4_xattr_set_credits(sb, NULL /* inode */, NULL /* block_bh */, acl_size, true /* is_create */); posix_acl_release(p); } #endif #ifdef CONFIG_SECURITY { int num_security_xattrs = 1; #ifdef CONFIG_INTEGRITY num_security_xattrs++; #endif /* * We assume that security xattrs are never more than 1k. * In practice they are under 128 bytes. */ nblocks += num_security_xattrs * __ext4_xattr_set_credits(sb, NULL /* inode */, NULL /* block_bh */, 1024, true /* is_create */); } #endif if (encrypt) nblocks += __ext4_xattr_set_credits(sb, NULL /* inode */, NULL /* block_bh */, FSCRYPT_SET_CONTEXT_MAX_SIZE, true /* is_create */); return nblocks; } /* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of * the groups with above-average free space, that group with the fewest * directories already is chosen. * * For other inodes, search forward from the parent directory's block * group to find a free inode. */ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, int handle_type, unsigned int line_no, int nblocks) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; struct buffer_head *group_desc_bh; ext4_group_t ngroups, group = 0; unsigned long ino = 0; struct inode *inode; struct ext4_group_desc *gdp = NULL; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; int ret2, err; struct inode *ret; ext4_group_t i; ext4_group_t flex_group; struct ext4_group_info *grp = NULL; bool encrypt = false; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); sb = dir->i_sb; sbi = EXT4_SB(sb); if (unlikely(ext4_forced_shutdown(sb))) return ERR_PTR(-EIO); ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); /* * Initialize owners and quota early so that we don't have to account * for quota initialization worst case in standard inode creating * transaction */ if (owner) { inode->i_mode = mode; i_uid_write(inode, owner[0]); i_gid_write(inode, owner[1]); } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; inode_fsuid_set(inode, idmap); inode->i_gid = dir->i_gid; } else inode_init_owner(idmap, inode, dir, mode); if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) ei->i_projid = EXT4_I(dir)->i_projid; else ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); if (!(i_flags & EXT4_EA_INODE_FL)) { err = fscrypt_prepare_new_inode(dir, inode, &encrypt); if (err) goto out; } err = dquot_initialize(inode); if (err) goto out; if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt); if (ret2 < 0) { err = ret2; goto out; } nblocks += ret2; } if (!goal) goal = sbi->s_inode_goal; if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); ret2 = 0; goto got_group; } if (S_ISDIR(mode)) ret2 = find_group_orlov(sb, dir, &group, mode, qstr); else ret2 = find_group_other(sb, dir, &group, mode); got_group: EXT4_I(dir)->i_last_alloc_group = group; err = -ENOSPC; if (ret2 == -1) goto out; /* * Normally we will only go through one pass of this loop, * unless we get unlucky and it turns out the group we selected * had its last inode grabbed by someone else. */ for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) goto out; /* * Check free inodes count before loading bitmap. */ if (ext4_free_inodes_count(sb, gdp) == 0) goto next_group; if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, group); /* * Skip groups with already-known suspicious inode * tables */ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) goto next_group; } brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); /* Skip groups with suspicious inode tables */ if (((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || IS_ERR(inode_bitmap_bh)) { inode_bitmap_bh = NULL; goto next_group; } repeat_in_this_group: ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); if (!ret2) goto next_group; if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto next_group; } if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(NULL, dir->i_sb, line_no, handle_type, nblocks, 0, ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); goto out; } } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh, EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); if (ret2) { /* Someone already took the bit. Repeat the search * with lock held. */ ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); if (ret2) { ext4_set_bit(ino, inode_bitmap_bh->b_data); ret2 = 0; } else { ret2 = 1; /* we didn't grab the inode */ } } ext4_unlock_group(sb, group); ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! */ if (ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; next_group: if (++group == ngroups) group = 0; } err = -ENOSPC; goto out; got: BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } BUFFER_TRACE(group_desc_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, group_desc_bh, EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; } /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { struct buffer_head *block_bitmap_bh; block_bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(block_bitmap_bh)) { err = PTR_ERR(block_bitmap_bh); goto out; } BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh, EXT4_JTR_NONE); if (err) { brelse(block_bitmap_bh); ext4_std_error(sb, err); goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); brelse(block_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } } /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; struct ext4_group_info *grp = NULL; if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, group); if (!grp) { err = -EFSCORRUPTED; goto out; } down_read(&grp->alloc_sem); /* * protect vs itable * lazyinit */ } ext4_lock_group(sb, group); /* while we modify the bg desc */ free = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); free = 0; } /* * Check the relative inode number against the last used * relative inode number in this group. if it is greater * we need to update the bg_itable_unused count */ if (ino > free) ext4_itable_unused_set(sb, gdp, (EXT4_INODES_PER_GROUP(sb) - ino)); if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) up_read(&grp->alloc_sem); } else { ext4_lock_group(sb, group); } ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (S_ISDIR(mode)) { ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); if (sbi->s_log_groups_per_flex) { ext4_group_t f = ext4_flex_group(sbi, group); atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups, f)->used_dirs); } } if (ext4_has_group_desc_csum(sb)) { ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); if (err) { ext4_std_error(sb, err); goto out; } percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups, flex_group)->free_inodes); } inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); ei->i_crtime = inode->i_mtime; memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; ei->i_disksize = 0; /* Don't inherit extent flag from directory, amongst others. */ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); ei->i_flags |= i_flags; ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; ei->i_last_alloc_group = ~0; ext4_set_inode_flags(inode, true); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { /* * Likely a bitmap corruption causing inode to be allocated * twice. */ err = -EIO; ext4_error(sb, "failed to insert inode %lu: doubly allocated?", inode->i_ino); ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); } ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; if (ext4_has_feature_inline_data(sb) && (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); if (err) goto fail_drop; /* * Since the encryption xattr will always be unique, create it first so * that it's less likely to end up in an external xattr block and * prevent its deduplication. */ if (encrypt) { err = fscrypt_set_context(inode, handle); if (err) goto fail_free_drop; } if (!(ei->i_flags & EXT4_EA_INODE_FL)) { err = ext4_init_acl(handle, inode, dir); if (err) goto fail_free_drop; err = ext4_init_security(handle, inode, dir, qstr); if (err) goto fail_free_drop; } if (ext4_has_feature_extents(sb)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode); } } if (ext4_handle_valid(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; ei->i_datasync_tid = handle->h_transaction->t_tid; } err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); goto fail_free_drop; } ext4_debug("allocating inode %lu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); brelse(inode_bitmap_bh); return ret; fail_free_drop: dquot_free_inode(inode); fail_drop: clear_nlink(inode); unlock_new_inode(inode); out: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; iput(inode); brelse(inode_bitmap_bh); return ERR_PTR(err); } /* Verify that we are loading a valid orphan from disk */ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) { unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); ext4_group_t block_group; int bit; struct buffer_head *bitmap_bh = NULL; struct inode *inode = NULL; int err = -EFSCORRUPTED; if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) goto bad_orphan; block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) return ERR_CAST(bitmap_bh); /* Having the inode bit set should be a 100% indicator that this * is a valid orphan (no e2fsck run on fs). Orphans also include * inodes that were being truncated, so we can't check i_nlink==0. */ if (!ext4_test_bit(bit, bitmap_bh->b_data)) goto bad_orphan; inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error_err(sb, -err, "couldn't read orphan inode %lu (err %d)", ino, err); brelse(bitmap_bh); return inode; } /* * If the orphans has i_nlinks > 0 then it should be able to * be truncated, otherwise it won't be removed from the orphan * list during processing and an infinite loop will result. * Similarly, it must not be a bad inode. */ if ((inode->i_nlink && !ext4_can_truncate(inode)) || is_bad_inode(inode)) goto bad_orphan; if (NEXT_ORPHAN(inode) > max_ino) goto bad_orphan; brelse(bitmap_bh); return inode; bad_orphan: ext4_error(sb, "bad orphan inode %lu", ino); if (bitmap_bh) printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); if (inode) { printk(KERN_ERR "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); printk(KERN_ERR "max_ino=%lu\n", max_ino); printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; iput(inode); } brelse(bitmap_bh); return ERR_PTR(err); } unsigned long ext4_count_free_inodes(struct super_block *sb) { unsigned long desc_count; struct ext4_group_desc *gdp; ext4_group_t i, ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; unsigned long bitmap_count, x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; desc_count = 0; bitmap_count = 0; gdp = NULL; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += ext4_free_inodes_count(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_inode_bitmap(sb, i); if (IS_ERR(bitmap_bh)) { bitmap_bh = NULL; continue; } x = ext4_count_free(bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_inodes: " "stored = %u, computed = %lu, %lu\n", le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); return desc_count; #else desc_count = 0; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += ext4_free_inodes_count(sb, gdp); cond_resched(); } return desc_count; #endif } /* Called at mount-time, super-block is locked */ unsigned long ext4_count_dirs(struct super_block * sb) { unsigned long count = 0; ext4_group_t i, ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; count += ext4_used_dirs_count(sb, gdp); } return count; } /* * Zeroes not yet zeroed inode table - just write zeroes through the whole * inode table. Must be called without any spinlock held. The only place * where it is called from on active part of filesystem is ext4lazyinit * thread, so we do not need any special locks, however we have to prevent * inode allocation from the current group, so we take alloc_sem lock, to * block ext4_new_inode() until we are finished. */ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; struct buffer_head *group_desc_bh; handle_t *handle; ext4_fsblk_t blk; int num, ret = 0, used_blks = 0; unsigned long used_inos = 0; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !grp) goto out; /* * We do not need to lock this, because we are the only one * handling this flag. */ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) goto out; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } down_write(&grp->alloc_sem); /* * If inode bitmap was already initialized there may be some * used inodes so we need to skip blocks with used inodes in * inode table. */ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { used_inos = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block); /* Bogus inode unused count? */ if (used_blks < 0 || used_blks > sbi->s_itb_per_group) { ext4_error(sb, "Something is wrong with group %u: " "used itable blocks: %d; " "itable unused count: %u", group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; goto err_out; } used_inos += group * EXT4_INODES_PER_GROUP(sb); /* * Are there some uninitialized inodes in the inode table * before the first normal inode? */ if ((used_blks != sbi->s_itb_per_group) && (used_inos < EXT4_FIRST_INO(sb))) { ext4_error(sb, "Something is wrong with group %u: " "itable unused count: %u; " "itables initialized count: %ld", group, ext4_itable_unused_count(sb, gdp), used_inos); ret = 1; goto err_out; } } blk = ext4_inode_table(sb, gdp) + used_blks; num = sbi->s_itb_per_group - used_blks; BUFFER_TRACE(group_desc_bh, "get_write_access"); ret = ext4_journal_get_write_access(handle, sb, group_desc_bh, EXT4_JTR_NONE); if (ret) goto err_out; /* * Skip zeroout if the inode table is full. But we set the ZEROED * flag anyway, because obviously, when it is full it does not need * further zeroing. */ if (unlikely(num == 0)) goto skip_zeroout; ext4_debug("going to zero out inode table in group %d\n", group); ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); if (ret < 0) goto err_out; if (barrier) blkdev_issue_flush(sb->s_bdev); skip_zeroout: ext4_lock_group(sb, group); gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); ret = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); err_out: up_write(&grp->alloc_sem); ext4_journal_stop(handle); out: return ret; } |
4 471 117 2495 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sock #if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SOCK_H #include <net/sock.h> #include <net/ipv6.h> #include <linux/tracepoint.h> #include <linux/ipv6.h> #include <linux/tcp.h> #define family_names \ EM(AF_INET) \ EMe(AF_INET6) /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ EM(IPPROTO_DCCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) #define tcp_state_names \ EM(TCP_ESTABLISHED) \ EM(TCP_SYN_SENT) \ EM(TCP_SYN_RECV) \ EM(TCP_FIN_WAIT1) \ EM(TCP_FIN_WAIT2) \ EM(TCP_TIME_WAIT) \ EM(TCP_CLOSE) \ EM(TCP_CLOSE_WAIT) \ EM(TCP_LAST_ACK) \ EM(TCP_LISTEN) \ EM(TCP_CLOSING) \ EMe(TCP_NEW_SYN_RECV) #define skmem_kind_names \ EM(SK_MEM_SEND) \ EMe(SK_MEM_RECV) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); family_names inet_protocol_names tcp_state_names skmem_kind_names #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } #define show_family_name(val) \ __print_symbolic(val, family_names) #define show_inet_protocol_name(val) \ __print_symbolic(val, inet_protocol_names) #define show_tcp_state_name(val) \ __print_symbolic(val, tcp_state_names) #define show_skmem_kind_names(val) \ __print_symbolic(val, skmem_kind_names) TRACE_EVENT(sock_rcvqueue_full, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(int, rmem_alloc) __field(unsigned int, truesize) __field(int, sk_rcvbuf) ), TP_fast_assign( __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->truesize = skb->truesize; __entry->sk_rcvbuf = READ_ONCE(sk->sk_rcvbuf); ), TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d", __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf) ); TRACE_EVENT(sock_exceed_buf_limit, TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind), TP_ARGS(sk, prot, allocated, kind), TP_STRUCT__entry( __array(char, name, 32) __array(long, sysctl_mem, 3) __field(long, allocated) __field(int, sysctl_rmem) __field(int, rmem_alloc) __field(int, sysctl_wmem) __field(int, wmem_alloc) __field(int, wmem_queued) __field(int, kind) ), TP_fast_assign( strncpy(__entry->name, prot->name, 32); __entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]); __entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]); __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]); __entry->allocated = allocated; __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->sysctl_wmem = sk_get_wmem0(sk, prot); __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc); __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued); __entry->kind = kind; ), TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s", __entry->name, __entry->sysctl_mem[0], __entry->sysctl_mem[1], __entry->sysctl_mem[2], __entry->allocated, __entry->sysctl_rmem, __entry->rmem_alloc, __entry->sysctl_wmem, __entry->wmem_alloc, __entry->wmem_queued, show_skmem_kind_names(__entry->kind) ) ); TRACE_EVENT(inet_sock_set_state, TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), TP_ARGS(sk, oldstate, newstate), TP_STRUCT__entry( __field(const void *, skaddr) __field(int, oldstate) __field(int, newstate) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); struct in6_addr *pin6; __be32 *p32; __entry->skaddr = sk; __entry->oldstate = oldstate; __entry->newstate = newstate; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { pin6 = (struct in6_addr *)__entry->saddr_v6; *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; *pin6 = sk->sk_v6_daddr; } else #endif { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); pin6 = (struct in6_addr *)__entry->daddr_v6; ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); } ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->oldstate), show_tcp_state_name(__entry->newstate)) ); TRACE_EVENT(inet_sk_error_report, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(int, error) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); struct in6_addr *pin6; __be32 *p32; __entry->error = sk->sk_err; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { pin6 = (struct in6_addr *)__entry->saddr_v6; *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; *pin6 = sk->sk_v6_daddr; } else #endif { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); pin6 = (struct in6_addr *)__entry->daddr_v6; ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); } ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->error) ); TRACE_EVENT(sk_data_ready, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, family) __field(__u16, protocol) __field(unsigned long, ip) ), TP_fast_assign( __entry->skaddr = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ip = _RET_IP_; ), TP_printk("family=%u protocol=%u func=%ps", __entry->family, __entry->protocol, (void *)__entry->ip) ); /* * sock send/recv msg length */ DECLARE_EVENT_CLASS(sock_msg_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags), TP_STRUCT__entry( __field(void *, sk) __field(__u16, family) __field(__u16, protocol) __field(int, ret) __field(int, flags) ), TP_fast_assign( __entry->sk = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ret = ret; __entry->flags = flags; ), TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x", __entry->sk, show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), !(__entry->flags & MSG_PEEK) ? (__entry->ret > 0 ? __entry->ret : 0) : 0, __entry->ret < 0 ? __entry->ret : 0, __entry->flags) ); DEFINE_EVENT(sock_msg_length, sock_send_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); DEFINE_EVENT(sock_msg_length, sock_recv_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * These are the definitions needed for the tsnmap type. The tsnmap is used * to track out of order TSNs received. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Jon Grimm <jgrimm@us.ibm.com> * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Sridhar Samudrala <sri@us.ibm.com> */ #include <net/sctp/constants.h> #ifndef __sctp_tsnmap_h__ #define __sctp_tsnmap_h__ /* RFC 2960 12.2 Parameters necessary per association (i.e. the TCB) * Mapping An array of bits or bytes indicating which out of * Array order TSN's have been received (relative to the * Last Rcvd TSN). If no gaps exist, i.e. no out of * order packets have been received, this array * will be set to all zero. This structure may be * in the form of a circular buffer or bit array. */ struct sctp_tsnmap { /* This array counts the number of chunks with each TSN. * It points at one of the two buffers with which we will * ping-pong between. */ unsigned long *tsn_map; /* This is the TSN at tsn_map[0]. */ __u32 base_tsn; /* Last Rcvd : This is the last TSN received in * TSN : sequence. This value is set initially by * : taking the peer's Initial TSN, received in * : the INIT or INIT ACK chunk, and subtracting * : one from it. * * Throughout most of the specification this is called the * "Cumulative TSN ACK Point". In this case, we * ignore the advice in 12.2 in favour of the term * used in the bulk of the text. */ __u32 cumulative_tsn_ack_point; /* This is the highest TSN we've marked. */ __u32 max_tsn_seen; /* This is the minimum number of TSNs we can track. This corresponds * to the size of tsn_map. Note: the overflow_map allows us to * potentially track more than this quantity. */ __u16 len; /* Data chunks pending receipt. used by SCTP_STATUS sockopt */ __u16 pending_data; /* Record duplicate TSNs here. We clear this after * every SACK. Store up to SCTP_MAX_DUP_TSNS worth of * information. */ __u16 num_dup_tsns; __be32 dup_tsns[SCTP_MAX_DUP_TSNS]; }; struct sctp_tsnmap_iter { __u32 start; }; /* Initialize a block of memory as a tsnmap. */ struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *, __u16 len, __u32 initial_tsn, gfp_t gfp); void sctp_tsnmap_free(struct sctp_tsnmap *map); /* Test the tracking state of this TSN. * Returns: * 0 if the TSN has not yet been seen * >0 if the TSN has been seen (duplicate) * <0 if the TSN is invalid (too large to track) */ int sctp_tsnmap_check(const struct sctp_tsnmap *, __u32 tsn); /* Mark this TSN as seen. */ int sctp_tsnmap_mark(struct sctp_tsnmap *, __u32 tsn, struct sctp_transport *trans); /* Mark this TSN and all lower as seen. */ void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn); /* Retrieve the Cumulative TSN ACK Point. */ static inline __u32 sctp_tsnmap_get_ctsn(const struct sctp_tsnmap *map) { return map->cumulative_tsn_ack_point; } /* Retrieve the highest TSN we've seen. */ static inline __u32 sctp_tsnmap_get_max_tsn_seen(const struct sctp_tsnmap *map) { return map->max_tsn_seen; } /* How many duplicate TSNs are stored? */ static inline __u16 sctp_tsnmap_num_dups(struct sctp_tsnmap *map) { return map->num_dup_tsns; } /* Return pointer to duplicate tsn array as needed by SACK. */ static inline __be32 *sctp_tsnmap_get_dups(struct sctp_tsnmap *map) { map->num_dup_tsns = 0; return map->dup_tsns; } /* How many gap ack blocks do we have recorded? */ __u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map, struct sctp_gap_ack_block *gabs); /* Refresh the count on pending data. */ __u16 sctp_tsnmap_pending(struct sctp_tsnmap *map); /* Is there a gap in the TSN map? */ static inline int sctp_tsnmap_has_gap(const struct sctp_tsnmap *map) { return map->cumulative_tsn_ack_point != map->max_tsn_seen; } /* Mark a duplicate TSN. Note: limit the storage of duplicate TSN * information. */ static inline void sctp_tsnmap_mark_dup(struct sctp_tsnmap *map, __u32 tsn) { if (map->num_dup_tsns < SCTP_MAX_DUP_TSNS) map->dup_tsns[map->num_dup_tsns++] = htonl(tsn); } /* Renege a TSN that was seen. */ void sctp_tsnmap_renege(struct sctp_tsnmap *, __u32 tsn); /* Is there a gap in the TSN map? */ int sctp_tsnmap_has_gap(const struct sctp_tsnmap *); #endif /* __sctp_tsnmap_h__ */ |
36 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Asynchronous Cryptographic Hash operations. * * This is the asynchronous version of hash.c with notification of * completion via a callback. * * Copyright (c) 2008 Loc Ho <lho@amcc.com> */ #include <crypto/scatterwalk.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/string.h> #include <net/netlink.h> #include "hash.h" static const struct crypto_type crypto_ahash_type; struct ahash_request_priv { crypto_completion_t complete; void *data; u8 *result; u32 flags; void *ubuf[] CRYPTO_MINALIGN_ATTR; }; static int hash_walk_next(struct crypto_hash_walk *walk) { unsigned int alignmask = walk->alignmask; unsigned int offset = walk->offset; unsigned int nbytes = min(walk->entrylen, ((unsigned int)(PAGE_SIZE)) - offset); walk->data = kmap_local_page(walk->pg); walk->data += offset; if (offset & alignmask) { unsigned int unaligned = alignmask + 1 - (offset & alignmask); if (nbytes > unaligned) nbytes = unaligned; } walk->entrylen -= nbytes; return nbytes; } static int hash_walk_new_entry(struct crypto_hash_walk *walk) { struct scatterlist *sg; sg = walk->sg; walk->offset = sg->offset; walk->pg = sg_page(walk->sg) + (walk->offset >> PAGE_SHIFT); walk->offset = offset_in_page(walk->offset); walk->entrylen = sg->length; if (walk->entrylen > walk->total) walk->entrylen = walk->total; walk->total -= walk->entrylen; return hash_walk_next(walk); } int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err) { unsigned int alignmask = walk->alignmask; walk->data -= walk->offset; if (walk->entrylen && (walk->offset & alignmask) && !err) { unsigned int nbytes; walk->offset = ALIGN(walk->offset, alignmask + 1); nbytes = min(walk->entrylen, (unsigned int)(PAGE_SIZE - walk->offset)); if (nbytes) { walk->entrylen -= nbytes; walk->data += walk->offset; return nbytes; } } kunmap_local(walk->data); crypto_yield(walk->flags); if (err) return err; if (walk->entrylen) { walk->offset = 0; walk->pg++; return hash_walk_next(walk); } if (!walk->total) return 0; walk->sg = sg_next(walk->sg); return hash_walk_new_entry(walk); } EXPORT_SYMBOL_GPL(crypto_hash_walk_done); int crypto_hash_walk_first(struct ahash_request *req, struct crypto_hash_walk *walk) { walk->total = req->nbytes; if (!walk->total) { walk->entrylen = 0; return 0; } walk->alignmask = crypto_ahash_alignmask(crypto_ahash_reqtfm(req)); walk->sg = req->src; walk->flags = req->base.flags; return hash_walk_new_entry(walk); } EXPORT_SYMBOL_GPL(crypto_hash_walk_first); static int ahash_setkey_unaligned(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen) { unsigned long alignmask = crypto_ahash_alignmask(tfm); int ret; u8 *buffer, *alignbuffer; unsigned long absize; absize = keylen + alignmask; buffer = kmalloc(absize, GFP_KERNEL); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = tfm->setkey(tfm, alignbuffer, keylen); kfree_sensitive(buffer); return ret; } static int ahash_nosetkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen) { return -ENOSYS; } static void ahash_set_needkey(struct crypto_ahash *tfm) { const struct hash_alg_common *alg = crypto_hash_alg_common(tfm); if (tfm->setkey != ahash_nosetkey && !(alg->base.cra_flags & CRYPTO_ALG_OPTIONAL_KEY)) crypto_ahash_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen) { unsigned long alignmask = crypto_ahash_alignmask(tfm); int err; if ((unsigned long)key & alignmask) err = ahash_setkey_unaligned(tfm, key, keylen); else err = tfm->setkey(tfm, key, keylen); if (unlikely(err)) { ahash_set_needkey(tfm); return err; } crypto_ahash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_ahash_setkey); static int ahash_save_req(struct ahash_request *req, crypto_completion_t cplt, bool has_state) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); unsigned long alignmask = crypto_ahash_alignmask(tfm); unsigned int ds = crypto_ahash_digestsize(tfm); struct ahash_request *subreq; unsigned int subreq_size; unsigned int reqsize; u8 *result; gfp_t gfp; u32 flags; subreq_size = sizeof(*subreq); reqsize = crypto_ahash_reqsize(tfm); reqsize = ALIGN(reqsize, crypto_tfm_ctx_alignment()); subreq_size += reqsize; subreq_size += ds; subreq_size += alignmask & ~(crypto_tfm_ctx_alignment() - 1); flags = ahash_request_flags(req); gfp = (flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? GFP_KERNEL : GFP_ATOMIC; subreq = kmalloc(subreq_size, gfp); if (!subreq) return -ENOMEM; ahash_request_set_tfm(subreq, tfm); ahash_request_set_callback(subreq, flags, cplt, req); result = (u8 *)(subreq + 1) + reqsize; result = PTR_ALIGN(result, alignmask + 1); ahash_request_set_crypt(subreq, req->src, result, req->nbytes); if (has_state) { void *state; state = kmalloc(crypto_ahash_statesize(tfm), gfp); if (!state) { kfree(subreq); return -ENOMEM; } crypto_ahash_export(req, state); crypto_ahash_import(subreq, state); kfree_sensitive(state); } req->priv = subreq; return 0; } static void ahash_restore_req(struct ahash_request *req, int err) { struct ahash_request *subreq = req->priv; if (!err) memcpy(req->result, subreq->result, crypto_ahash_digestsize(crypto_ahash_reqtfm(req))); req->priv = NULL; kfree_sensitive(subreq); } static void ahash_op_unaligned_done(void *data, int err) { struct ahash_request *areq = data; if (err == -EINPROGRESS) goto out; /* First copy req->result into req->priv.result */ ahash_restore_req(areq, err); out: /* Complete the ORIGINAL request. */ ahash_request_complete(areq, err); } static int ahash_op_unaligned(struct ahash_request *req, int (*op)(struct ahash_request *), bool has_state) { int err; err = ahash_save_req(req, ahash_op_unaligned_done, has_state); if (err) return err; err = op(req->priv); if (err == -EINPROGRESS || err == -EBUSY) return err; ahash_restore_req(req, err); return err; } static int crypto_ahash_op(struct ahash_request *req, int (*op)(struct ahash_request *), bool has_state) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); unsigned long alignmask = crypto_ahash_alignmask(tfm); int err; if ((unsigned long)req->result & alignmask) err = ahash_op_unaligned(req, op, has_state); else err = op(req); return crypto_hash_errstat(crypto_hash_alg_common(tfm), err); } int crypto_ahash_final(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct hash_alg_common *alg = crypto_hash_alg_common(tfm); if (IS_ENABLED(CONFIG_CRYPTO_STATS)) atomic64_inc(&hash_get_stat(alg)->hash_cnt); return crypto_ahash_op(req, tfm->final, true); } EXPORT_SYMBOL_GPL(crypto_ahash_final); int crypto_ahash_finup(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct hash_alg_common *alg = crypto_hash_alg_common(tfm); if (IS_ENABLED(CONFIG_CRYPTO_STATS)) { struct crypto_istat_hash *istat = hash_get_stat(alg); atomic64_inc(&istat->hash_cnt); atomic64_add(req->nbytes, &istat->hash_tlen); } return crypto_ahash_op(req, tfm->finup, true); } EXPORT_SYMBOL_GPL(crypto_ahash_finup); int crypto_ahash_digest(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct hash_alg_common *alg = crypto_hash_alg_common(tfm); if (IS_ENABLED(CONFIG_CRYPTO_STATS)) { struct crypto_istat_hash *istat = hash_get_stat(alg); atomic64_inc(&istat->hash_cnt); atomic64_add(req->nbytes, &istat->hash_tlen); } if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return crypto_hash_errstat(alg, -ENOKEY); return crypto_ahash_op(req, tfm->digest, false); } EXPORT_SYMBOL_GPL(crypto_ahash_digest); static void ahash_def_finup_done2(void *data, int err) { struct ahash_request *areq = data; if (err == -EINPROGRESS) return; ahash_restore_req(areq, err); ahash_request_complete(areq, err); } static int ahash_def_finup_finish1(struct ahash_request *req, int err) { struct ahash_request *subreq = req->priv; if (err) goto out; subreq->base.complete = ahash_def_finup_done2; err = crypto_ahash_reqtfm(req)->final(subreq); if (err == -EINPROGRESS || err == -EBUSY) return err; out: ahash_restore_req(req, err); return err; } static void ahash_def_finup_done1(void *data, int err) { struct ahash_request *areq = data; struct ahash_request *subreq; if (err == -EINPROGRESS) goto out; subreq = areq->priv; subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG; err = ahash_def_finup_finish1(areq, err); if (err == -EINPROGRESS || err == -EBUSY) return; out: ahash_request_complete(areq, err); } static int ahash_def_finup(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); int err; err = ahash_save_req(req, ahash_def_finup_done1, true); if (err) return err; err = tfm->update(req->priv); if (err == -EINPROGRESS || err == -EBUSY) return err; return ahash_def_finup_finish1(req, err); } static void crypto_ahash_exit_tfm(struct crypto_tfm *tfm) { struct crypto_ahash *hash = __crypto_ahash_cast(tfm); struct ahash_alg *alg = crypto_ahash_alg(hash); alg->exit_tfm(hash); } static int crypto_ahash_init_tfm(struct crypto_tfm *tfm) { struct crypto_ahash *hash = __crypto_ahash_cast(tfm); struct ahash_alg *alg = crypto_ahash_alg(hash); hash->setkey = ahash_nosetkey; crypto_ahash_set_statesize(hash, alg->halg.statesize); if (tfm->__crt_alg->cra_type != &crypto_ahash_type) return crypto_init_shash_ops_async(tfm); hash->init = alg->init; hash->update = alg->update; hash->final = alg->final; hash->finup = alg->finup ?: ahash_def_finup; hash->digest = alg->digest; hash->export = alg->export; hash->import = alg->import; if (alg->setkey) { hash->setkey = alg->setkey; ahash_set_needkey(hash); } if (alg->exit_tfm) tfm->exit = crypto_ahash_exit_tfm; return alg->init_tfm ? alg->init_tfm(hash) : 0; } static unsigned int crypto_ahash_extsize(struct crypto_alg *alg) { if (alg->cra_type != &crypto_ahash_type) return sizeof(struct crypto_shash *); return crypto_alg_extsize(alg); } static void crypto_ahash_free_instance(struct crypto_instance *inst) { struct ahash_instance *ahash = ahash_instance(inst); ahash->free(ahash); } static int __maybe_unused crypto_ahash_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_hash rhash; memset(&rhash, 0, sizeof(rhash)); strscpy(rhash.type, "ahash", sizeof(rhash.type)); rhash.blocksize = alg->cra_blocksize; rhash.digestsize = __crypto_hash_alg_common(alg)->digestsize; return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash); } static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg) { seq_printf(m, "type : ahash\n"); seq_printf(m, "async : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ? "yes" : "no"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "digestsize : %u\n", __crypto_hash_alg_common(alg)->digestsize); } static int __maybe_unused crypto_ahash_report_stat( struct sk_buff *skb, struct crypto_alg *alg) { return crypto_hash_report_stat(skb, alg, "ahash"); } static const struct crypto_type crypto_ahash_type = { .extsize = crypto_ahash_extsize, .init_tfm = crypto_ahash_init_tfm, .free = crypto_ahash_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_ahash_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_ahash_report, #endif #ifdef CONFIG_CRYPTO_STATS .report_stat = crypto_ahash_report_stat, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_AHASH_MASK, .type = CRYPTO_ALG_TYPE_AHASH, .tfmsize = offsetof(struct crypto_ahash, base), }; int crypto_grab_ahash(struct crypto_ahash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_ahash_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_ahash); struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_ahash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_ahash); int crypto_has_ahash(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_ahash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_ahash); struct crypto_ahash *crypto_clone_ahash(struct crypto_ahash *hash) { struct hash_alg_common *halg = crypto_hash_alg_common(hash); struct crypto_tfm *tfm = crypto_ahash_tfm(hash); struct crypto_ahash *nhash; struct ahash_alg *alg; int err; if (!crypto_hash_alg_has_setkey(halg)) { tfm = crypto_tfm_get(tfm); if (IS_ERR(tfm)) return ERR_CAST(tfm); return hash; } nhash = crypto_clone_tfm(&crypto_ahash_type, tfm); if (IS_ERR(nhash)) return nhash; nhash->init = hash->init; nhash->update = hash->update; nhash->final = hash->final; nhash->finup = hash->finup; nhash->digest = hash->digest; nhash->export = hash->export; nhash->import = hash->import; nhash->setkey = hash->setkey; nhash->reqsize = hash->reqsize; nhash->statesize = hash->statesize; if (tfm->__crt_alg->cra_type != &crypto_ahash_type) return crypto_clone_shash_ops_async(nhash, hash); err = -ENOSYS; alg = crypto_ahash_alg(hash); if (!alg->clone_tfm) goto out_free_nhash; err = alg->clone_tfm(nhash, hash); if (err) goto out_free_nhash; return nhash; out_free_nhash: crypto_free_ahash(nhash); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(crypto_clone_ahash); static int ahash_prepare_alg(struct ahash_alg *alg) { struct crypto_alg *base = &alg->halg.base; int err; if (alg->halg.statesize == 0) return -EINVAL; err = hash_prepare_alg(&alg->halg); if (err) return err; base->cra_type = &crypto_ahash_type; base->cra_flags |= CRYPTO_ALG_TYPE_AHASH; return 0; } int crypto_register_ahash(struct ahash_alg *alg) { struct crypto_alg *base = &alg->halg.base; int err; err = ahash_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_ahash); void crypto_unregister_ahash(struct ahash_alg *alg) { crypto_unregister_alg(&alg->halg.base); } EXPORT_SYMBOL_GPL(crypto_unregister_ahash); int crypto_register_ahashes(struct ahash_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_ahash(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_ahash(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_ahashes); void crypto_unregister_ahashes(struct ahash_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_ahash(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_ahashes); int ahash_register_instance(struct crypto_template *tmpl, struct ahash_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = ahash_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, ahash_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(ahash_register_instance); bool crypto_hash_alg_has_setkey(struct hash_alg_common *halg) { struct crypto_alg *alg = &halg->base; if (alg->cra_type != &crypto_ahash_type) return crypto_shash_alg_has_setkey(__crypto_shash_alg(alg)); return __crypto_ahash_alg(alg)->setkey != NULL; } EXPORT_SYMBOL_GPL(crypto_hash_alg_has_setkey); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Asynchronous cryptographic hash type"); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KCOV_H #define _LINUX_KCOV_H #include <linux/sched.h> #include <uapi/linux/kcov.h> struct task_struct; #ifdef CONFIG_KCOV enum kcov_mode { /* Coverage collection is not enabled yet. */ KCOV_MODE_DISABLED = 0, /* KCOV was initialized, but tracing mode hasn't been chosen yet. */ KCOV_MODE_INIT = 1, /* * Tracing coverage collection mode. * Covered PCs are collected in a per-task buffer. */ KCOV_MODE_TRACE_PC = 2, /* Collecting comparison operands mode. */ KCOV_MODE_TRACE_CMP = 3, }; #define KCOV_IN_CTXSW (1 << 30) void kcov_task_init(struct task_struct *t); void kcov_task_exit(struct task_struct *t); #define kcov_prepare_switch(t) \ do { \ (t)->kcov_mode |= KCOV_IN_CTXSW; \ } while (0) #define kcov_finish_switch(t) \ do { \ (t)->kcov_mode &= ~KCOV_IN_CTXSW; \ } while (0) /* See Documentation/dev-tools/kcov.rst for usage details. */ void kcov_remote_start(u64 handle); void kcov_remote_stop(void); u64 kcov_common_handle(void); static inline void kcov_remote_start_common(u64 id) { kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_COMMON, id)); } static inline void kcov_remote_start_usb(u64 id) { kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_USB, id)); } /* * The softirq flavor of kcov_remote_*() functions is introduced as a temporary * work around for kcov's lack of nested remote coverage sections support in * task context. Adding support for nested sections is tracked in: * https://bugzilla.kernel.org/show_bug.cgi?id=210337 */ static inline void kcov_remote_start_usb_softirq(u64 id) { if (in_serving_softirq()) kcov_remote_start_usb(id); } static inline void kcov_remote_stop_softirq(void) { if (in_serving_softirq()) kcov_remote_stop(); } #ifdef CONFIG_64BIT typedef unsigned long kcov_u64; #else typedef unsigned long long kcov_u64; #endif void __sanitizer_cov_trace_pc(void); void __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2); void __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2); void __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2); void __sanitizer_cov_trace_cmp8(kcov_u64 arg1, kcov_u64 arg2); void __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2); void __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2); void __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2); void __sanitizer_cov_trace_const_cmp8(kcov_u64 arg1, kcov_u64 arg2); void __sanitizer_cov_trace_switch(kcov_u64 val, void *cases); #else static inline void kcov_task_init(struct task_struct *t) {} static inline void kcov_task_exit(struct task_struct *t) {} static inline void kcov_prepare_switch(struct task_struct *t) {} static inline void kcov_finish_switch(struct task_struct *t) {} static inline void kcov_remote_start(u64 handle) {} static inline void kcov_remote_stop(void) {} static inline u64 kcov_common_handle(void) { return 0; } static inline void kcov_remote_start_common(u64 id) {} static inline void kcov_remote_start_usb(u64 id) {} static inline void kcov_remote_start_usb_softirq(u64 id) {} static inline void kcov_remote_stop_softirq(void) {} #endif /* CONFIG_KCOV */ #endif /* _LINUX_KCOV_H */ |
331 331 331 331 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | // SPDX-License-Identifier: GPL-2.0 #include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> #include "internal.h" /* * /proc/self: */ static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); char *name; if (!tgid) return ERR_PTR(-ENOENT); /* max length of unsigned int in decimal + NULL term */ name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%u", tgid); set_delayed_call(done, kfree_link, name); return name; } static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; static unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *self; int ret = -ENOMEM; inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { struct inode *inode = new_inode(s); if (inode) { inode->i_ino = self_inum; inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; d_add(self, inode); ret = 0; } else { dput(self); } } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); else fs_info->proc_self = self; return ret; } void __init proc_self_init(void) { proc_alloc_inum(&self_inum); } |
11581 1244 31 256 2121 4731 2120 27 256 1124 5236 1187 2988 1300 4283 9499 5705 1 47 4390 1187 1244 1269 342 25 132 269 1599 48 2337 2127 927 4365 929 929 1657 3262 798 809 3 825 46 587 231 320 20 89 7348 562 2617 558 4209 56 2029 1610 497 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Linux Security Module Hook declarations. * * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com> * Copyright (C) 2001 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com> * Copyright (C) 2001 James Morris <jmorris@intercode.com.au> * Copyright (C) 2001 Silicon Graphics, Inc. (Trust Technology Group) * Copyright (C) 2015 Intel Corporation. * Copyright (C) 2015 Casey Schaufler <casey@schaufler-ca.com> * Copyright (C) 2016 Mellanox Techonologies * Copyright (C) 2020 Google LLC. */ /* * The macro LSM_HOOK is used to define the data structures required by * the LSM framework using the pattern: * * LSM_HOOK(<return_type>, <default_value>, <hook_name>, args...) * * struct security_hook_heads { * #define LSM_HOOK(RET, DEFAULT, NAME, ...) struct hlist_head NAME; * #include <linux/lsm_hook_defs.h> * #undef LSM_HOOK * }; */ LSM_HOOK(int, 0, binder_set_context_mgr, const struct cred *mgr) LSM_HOOK(int, 0, binder_transaction, const struct cred *from, const struct cred *to) LSM_HOOK(int, 0, binder_transfer_binder, const struct cred *from, const struct cred *to) LSM_HOOK(int, 0, binder_transfer_file, const struct cred *from, const struct cred *to, const struct file *file) LSM_HOOK(int, 0, ptrace_access_check, struct task_struct *child, unsigned int mode) LSM_HOOK(int, 0, ptrace_traceme, struct task_struct *parent) LSM_HOOK(int, 0, capget, const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) LSM_HOOK(int, 0, capset, struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) LSM_HOOK(int, 0, capable, const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) LSM_HOOK(int, 0, quotactl, int cmds, int type, int id, struct super_block *sb) LSM_HOOK(int, 0, quota_on, struct dentry *dentry) LSM_HOOK(int, 0, syslog, int type) LSM_HOOK(int, 0, settime, const struct timespec64 *ts, const struct timezone *tz) LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages) LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm) LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, struct file *file) LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm) LSM_HOOK(int, 0, fs_context_submount, struct fs_context *fc, struct super_block *reference) LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc, struct fs_context *src_sc) LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc, struct fs_parameter *param) LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb) LSM_HOOK(void, LSM_RET_VOID, sb_delete, struct super_block *sb) LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb) LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts) LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts) LSM_HOOK(int, 0, sb_mnt_opts_compat, struct super_block *sb, void *mnt_opts) LSM_HOOK(int, 0, sb_remount, struct super_block *sb, void *mnt_opts) LSM_HOOK(int, 0, sb_kern_mount, struct super_block *sb) LSM_HOOK(int, 0, sb_show_options, struct seq_file *m, struct super_block *sb) LSM_HOOK(int, 0, sb_statfs, struct dentry *dentry) LSM_HOOK(int, 0, sb_mount, const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) LSM_HOOK(int, 0, sb_umount, struct vfsmount *mnt, int flags) LSM_HOOK(int, 0, sb_pivotroot, const struct path *old_path, const struct path *new_path) LSM_HOOK(int, 0, sb_set_mnt_opts, struct super_block *sb, void *mnt_opts, unsigned long kern_flags, unsigned long *set_kern_flags) LSM_HOOK(int, 0, sb_clone_mnt_opts, const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) LSM_HOOK(int, 0, move_mount, const struct path *from_path, const struct path *to_path) LSM_HOOK(int, -EOPNOTSUPP, dentry_init_security, struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, void **ctx, u32 *ctxlen) LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) #ifdef CONFIG_SECURITY_PATH LSM_HOOK(int, 0, path_unlink, const struct path *dir, struct dentry *dentry) LSM_HOOK(int, 0, path_mkdir, const struct path *dir, struct dentry *dentry, umode_t mode) LSM_HOOK(int, 0, path_rmdir, const struct path *dir, struct dentry *dentry) LSM_HOOK(int, 0, path_mknod, const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) LSM_HOOK(int, 0, path_truncate, const struct path *path) LSM_HOOK(int, 0, path_symlink, const struct path *dir, struct dentry *dentry, const char *old_name) LSM_HOOK(int, 0, path_link, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) LSM_HOOK(int, 0, path_rename, const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, unsigned int flags) LSM_HOOK(int, 0, path_chmod, const struct path *path, umode_t mode) LSM_HOOK(int, 0, path_chown, const struct path *path, kuid_t uid, kgid_t gid) LSM_HOOK(int, 0, path_chroot, const struct path *path) #endif /* CONFIG_SECURITY_PATH */ /* Needed for inode based security check */ LSM_HOOK(int, 0, path_notify, const struct path *path, u64 mask, unsigned int obj_type) LSM_HOOK(int, 0, inode_alloc_security, struct inode *inode) LSM_HOOK(void, LSM_RET_VOID, inode_free_security, struct inode *inode) LSM_HOOK(int, -EOPNOTSUPP, inode_init_security, struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count) LSM_HOOK(int, 0, inode_init_security_anon, struct inode *inode, const struct qstr *name, const struct inode *context_inode) LSM_HOOK(int, 0, inode_create, struct inode *dir, struct dentry *dentry, umode_t mode) LSM_HOOK(int, 0, inode_link, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) LSM_HOOK(int, 0, inode_unlink, struct inode *dir, struct dentry *dentry) LSM_HOOK(int, 0, inode_symlink, struct inode *dir, struct dentry *dentry, const char *old_name) LSM_HOOK(int, 0, inode_mkdir, struct inode *dir, struct dentry *dentry, umode_t mode) LSM_HOOK(int, 0, inode_rmdir, struct inode *dir, struct dentry *dentry) LSM_HOOK(int, 0, inode_mknod, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) LSM_HOOK(int, 0, inode_rename, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) LSM_HOOK(int, 0, inode_readlink, struct dentry *dentry) LSM_HOOK(int, 0, inode_follow_link, struct dentry *dentry, struct inode *inode, bool rcu) LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask) LSM_HOOK(int, 0, inode_setattr, struct dentry *dentry, struct iattr *attr) LSM_HOOK(int, 0, inode_getattr, const struct path *path) LSM_HOOK(int, 0, inode_setxattr, struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) LSM_HOOK(void, LSM_RET_VOID, inode_post_setxattr, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name) LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry) LSM_HOOK(int, 0, inode_removexattr, struct mnt_idmap *idmap, struct dentry *dentry, const char *name) LSM_HOOK(int, 0, inode_set_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) LSM_HOOK(int, 0, inode_get_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) LSM_HOOK(int, 0, inode_remove_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry) LSM_HOOK(int, 0, inode_killpriv, struct mnt_idmap *idmap, struct dentry *dentry) LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode, const char *name, const void *value, size_t size, int flags) LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer, size_t buffer_size) LSM_HOOK(void, LSM_RET_VOID, inode_getsecid, struct inode *inode, u32 *secid) LSM_HOOK(int, 0, inode_copy_up, struct dentry *src, struct cred **new) LSM_HOOK(int, -EOPNOTSUPP, inode_copy_up_xattr, const char *name) LSM_HOOK(int, 0, kernfs_init_security, struct kernfs_node *kn_dir, struct kernfs_node *kn) LSM_HOOK(int, 0, file_permission, struct file *file, int mask) LSM_HOOK(int, 0, file_alloc_security, struct file *file) LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file) LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd, unsigned long arg) LSM_HOOK(int, 0, mmap_addr, unsigned long addr) LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) LSM_HOOK(int, 0, file_mprotect, struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) LSM_HOOK(int, 0, file_lock, struct file *file, unsigned int cmd) LSM_HOOK(int, 0, file_fcntl, struct file *file, unsigned int cmd, unsigned long arg) LSM_HOOK(void, LSM_RET_VOID, file_set_fowner, struct file *file) LSM_HOOK(int, 0, file_send_sigiotask, struct task_struct *tsk, struct fown_struct *fown, int sig) LSM_HOOK(int, 0, file_receive, struct file *file) LSM_HOOK(int, 0, file_open, struct file *file) LSM_HOOK(int, 0, file_truncate, struct file *file) LSM_HOOK(int, 0, task_alloc, struct task_struct *task, unsigned long clone_flags) LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task) LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp) LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred) LSM_HOOK(int, 0, cred_prepare, struct cred *new, const struct cred *old, gfp_t gfp) LSM_HOOK(void, LSM_RET_VOID, cred_transfer, struct cred *new, const struct cred *old) LSM_HOOK(void, LSM_RET_VOID, cred_getsecid, const struct cred *c, u32 *secid) LSM_HOOK(int, 0, kernel_act_as, struct cred *new, u32 secid) LSM_HOOK(int, 0, kernel_create_files_as, struct cred *new, struct inode *inode) LSM_HOOK(int, 0, kernel_module_request, char *kmod_name) LSM_HOOK(int, 0, kernel_load_data, enum kernel_load_data_id id, bool contents) LSM_HOOK(int, 0, kernel_post_load_data, char *buf, loff_t size, enum kernel_load_data_id id, char *description) LSM_HOOK(int, 0, kernel_read_file, struct file *file, enum kernel_read_file_id id, bool contents) LSM_HOOK(int, 0, kernel_post_read_file, struct file *file, char *buf, loff_t size, enum kernel_read_file_id id) LSM_HOOK(int, 0, task_fix_setuid, struct cred *new, const struct cred *old, int flags) LSM_HOOK(int, 0, task_fix_setgid, struct cred *new, const struct cred * old, int flags) LSM_HOOK(int, 0, task_fix_setgroups, struct cred *new, const struct cred * old) LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid) LSM_HOOK(int, 0, task_getpgid, struct task_struct *p) LSM_HOOK(int, 0, task_getsid, struct task_struct *p) LSM_HOOK(void, LSM_RET_VOID, current_getsecid_subj, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, task_getsecid_obj, struct task_struct *p, u32 *secid) LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice) LSM_HOOK(int, 0, task_setioprio, struct task_struct *p, int ioprio) LSM_HOOK(int, 0, task_getioprio, struct task_struct *p) LSM_HOOK(int, 0, task_prlimit, const struct cred *cred, const struct cred *tcred, unsigned int flags) LSM_HOOK(int, 0, task_setrlimit, struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) LSM_HOOK(int, 0, task_setscheduler, struct task_struct *p) LSM_HOOK(int, 0, task_getscheduler, struct task_struct *p) LSM_HOOK(int, 0, task_movememory, struct task_struct *p) LSM_HOOK(int, 0, task_kill, struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) LSM_HOOK(int, -ENOSYS, task_prctl, int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p, struct inode *inode) LSM_HOOK(int, 0, userns_create, const struct cred *cred) LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag) LSM_HOOK(void, LSM_RET_VOID, ipc_getsecid, struct kern_ipc_perm *ipcp, u32 *secid) LSM_HOOK(int, 0, msg_msg_alloc_security, struct msg_msg *msg) LSM_HOOK(void, LSM_RET_VOID, msg_msg_free_security, struct msg_msg *msg) LSM_HOOK(int, 0, msg_queue_alloc_security, struct kern_ipc_perm *perm) LSM_HOOK(void, LSM_RET_VOID, msg_queue_free_security, struct kern_ipc_perm *perm) LSM_HOOK(int, 0, msg_queue_associate, struct kern_ipc_perm *perm, int msqflg) LSM_HOOK(int, 0, msg_queue_msgctl, struct kern_ipc_perm *perm, int cmd) LSM_HOOK(int, 0, msg_queue_msgsnd, struct kern_ipc_perm *perm, struct msg_msg *msg, int msqflg) LSM_HOOK(int, 0, msg_queue_msgrcv, struct kern_ipc_perm *perm, struct msg_msg *msg, struct task_struct *target, long type, int mode) LSM_HOOK(int, 0, shm_alloc_security, struct kern_ipc_perm *perm) LSM_HOOK(void, LSM_RET_VOID, shm_free_security, struct kern_ipc_perm *perm) LSM_HOOK(int, 0, shm_associate, struct kern_ipc_perm *perm, int shmflg) LSM_HOOK(int, 0, shm_shmctl, struct kern_ipc_perm *perm, int cmd) LSM_HOOK(int, 0, shm_shmat, struct kern_ipc_perm *perm, char __user *shmaddr, int shmflg) LSM_HOOK(int, 0, sem_alloc_security, struct kern_ipc_perm *perm) LSM_HOOK(void, LSM_RET_VOID, sem_free_security, struct kern_ipc_perm *perm) LSM_HOOK(int, 0, sem_associate, struct kern_ipc_perm *perm, int semflg) LSM_HOOK(int, 0, sem_semctl, struct kern_ipc_perm *perm, int cmd) LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops, unsigned nsops, int alter) LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb) LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry, struct inode *inode) LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) LSM_HOOK(int, 0, ismaclabel, const char *name) LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata, u32 *seclen) LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen) LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode) LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen) LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen) LSM_HOOK(int, 0, inode_getsecctx, struct inode *inode, void **ctx, u32 *ctxlen) #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) LSM_HOOK(int, 0, post_notification, const struct cred *w_cred, const struct cred *cred, struct watch_notification *n) #endif /* CONFIG_SECURITY && CONFIG_WATCH_QUEUE */ #if defined(CONFIG_SECURITY) && defined(CONFIG_KEY_NOTIFICATIONS) LSM_HOOK(int, 0, watch_key, struct key *key) #endif /* CONFIG_SECURITY && CONFIG_KEY_NOTIFICATIONS */ #ifdef CONFIG_SECURITY_NETWORK LSM_HOOK(int, 0, unix_stream_connect, struct sock *sock, struct sock *other, struct sock *newsk) LSM_HOOK(int, 0, unix_may_send, struct socket *sock, struct socket *other) LSM_HOOK(int, 0, socket_create, int family, int type, int protocol, int kern) LSM_HOOK(int, 0, socket_post_create, struct socket *sock, int family, int type, int protocol, int kern) LSM_HOOK(int, 0, socket_socketpair, struct socket *socka, struct socket *sockb) LSM_HOOK(int, 0, socket_bind, struct socket *sock, struct sockaddr *address, int addrlen) LSM_HOOK(int, 0, socket_connect, struct socket *sock, struct sockaddr *address, int addrlen) LSM_HOOK(int, 0, socket_listen, struct socket *sock, int backlog) LSM_HOOK(int, 0, socket_accept, struct socket *sock, struct socket *newsock) LSM_HOOK(int, 0, socket_sendmsg, struct socket *sock, struct msghdr *msg, int size) LSM_HOOK(int, 0, socket_recvmsg, struct socket *sock, struct msghdr *msg, int size, int flags) LSM_HOOK(int, 0, socket_getsockname, struct socket *sock) LSM_HOOK(int, 0, socket_getpeername, struct socket *sock) LSM_HOOK(int, 0, socket_getsockopt, struct socket *sock, int level, int optname) LSM_HOOK(int, 0, socket_setsockopt, struct socket *sock, int level, int optname) LSM_HOOK(int, 0, socket_shutdown, struct socket *sock, int how) LSM_HOOK(int, 0, socket_sock_rcv_skb, struct sock *sk, struct sk_buff *skb) LSM_HOOK(int, 0, socket_getpeersec_stream, struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) LSM_HOOK(int, 0, socket_getpeersec_dgram, struct socket *sock, struct sk_buff *skb, u32 *secid) LSM_HOOK(int, 0, sk_alloc_security, struct sock *sk, int family, gfp_t priority) LSM_HOOK(void, LSM_RET_VOID, sk_free_security, struct sock *sk) LSM_HOOK(void, LSM_RET_VOID, sk_clone_security, const struct sock *sk, struct sock *newsk) LSM_HOOK(void, LSM_RET_VOID, sk_getsecid, const struct sock *sk, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, sock_graft, struct sock *sk, struct socket *parent) LSM_HOOK(int, 0, inet_conn_request, const struct sock *sk, struct sk_buff *skb, struct request_sock *req) LSM_HOOK(void, LSM_RET_VOID, inet_csk_clone, struct sock *newsk, const struct request_sock *req) LSM_HOOK(void, LSM_RET_VOID, inet_conn_established, struct sock *sk, struct sk_buff *skb) LSM_HOOK(int, 0, secmark_relabel_packet, u32 secid) LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_inc, void) LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_dec, void) LSM_HOOK(void, LSM_RET_VOID, req_classify_flow, const struct request_sock *req, struct flowi_common *flic) LSM_HOOK(int, 0, tun_dev_alloc_security, void **security) LSM_HOOK(void, LSM_RET_VOID, tun_dev_free_security, void *security) LSM_HOOK(int, 0, tun_dev_create, void) LSM_HOOK(int, 0, tun_dev_attach_queue, void *security) LSM_HOOK(int, 0, tun_dev_attach, struct sock *sk, void *security) LSM_HOOK(int, 0, tun_dev_open, void *security) LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_association *asoc, struct sk_buff *skb) LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) LSM_HOOK(int, 0, sctp_assoc_established, struct sctp_association *asoc, struct sk_buff *skb) LSM_HOOK(int, 0, mptcp_add_subflow, struct sock *sk, struct sock *ssk) #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND LSM_HOOK(int, 0, ib_pkey_access, void *sec, u64 subnet_prefix, u16 pkey) LSM_HOOK(int, 0, ib_endport_manage_subnet, void *sec, const char *dev_name, u8 port_num) LSM_HOOK(int, 0, ib_alloc_security, void **sec) LSM_HOOK(void, LSM_RET_VOID, ib_free_security, void *sec) #endif /* CONFIG_SECURITY_INFINIBAND */ #ifdef CONFIG_SECURITY_NETWORK_XFRM LSM_HOOK(int, 0, xfrm_policy_alloc_security, struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp) LSM_HOOK(int, 0, xfrm_policy_clone_security, struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctx) LSM_HOOK(void, LSM_RET_VOID, xfrm_policy_free_security, struct xfrm_sec_ctx *ctx) LSM_HOOK(int, 0, xfrm_policy_delete_security, struct xfrm_sec_ctx *ctx) LSM_HOOK(int, 0, xfrm_state_alloc, struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) LSM_HOOK(int, 0, xfrm_state_alloc_acquire, struct xfrm_state *x, struct xfrm_sec_ctx *polsec, u32 secid) LSM_HOOK(void, LSM_RET_VOID, xfrm_state_free_security, struct xfrm_state *x) LSM_HOOK(int, 0, xfrm_state_delete_security, struct xfrm_state *x) LSM_HOOK(int, 0, xfrm_policy_lookup, struct xfrm_sec_ctx *ctx, u32 fl_secid) LSM_HOOK(int, 1, xfrm_state_pol_flow_match, struct xfrm_state *x, struct xfrm_policy *xp, const struct flowi_common *flic) LSM_HOOK(int, 0, xfrm_decode_session, struct sk_buff *skb, u32 *secid, int ckall) #endif /* CONFIG_SECURITY_NETWORK_XFRM */ /* key management security hooks */ #ifdef CONFIG_KEYS LSM_HOOK(int, 0, key_alloc, struct key *key, const struct cred *cred, unsigned long flags) LSM_HOOK(void, LSM_RET_VOID, key_free, struct key *key) LSM_HOOK(int, 0, key_permission, key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **buffer) #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT LSM_HOOK(int, 0, audit_rule_init, u32 field, u32 op, char *rulestr, void **lsmrule) LSM_HOOK(int, 0, audit_rule_known, struct audit_krule *krule) LSM_HOOK(int, 0, audit_rule_match, u32 secid, u32 field, u32 op, void *lsmrule) LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) #endif /* CONFIG_AUDIT */ #ifdef CONFIG_BPF_SYSCALL LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) #ifdef CONFIG_PERF_EVENTS LSM_HOOK(int, 0, perf_event_open, struct perf_event_attr *attr, int type) LSM_HOOK(int, 0, perf_event_alloc, struct perf_event *event) LSM_HOOK(void, LSM_RET_VOID, perf_event_free, struct perf_event *event) LSM_HOOK(int, 0, perf_event_read, struct perf_event *event) LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) #endif /* CONFIG_PERF_EVENTS */ #ifdef CONFIG_IO_URING LSM_HOOK(int, 0, uring_override_creds, const struct cred *new) LSM_HOOK(int, 0, uring_sqpoll, void) LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd) #endif /* CONFIG_IO_URING */ |
3 1 3 1 1 2 2 3 4 4 2 1 1 1 3 3 3 3 1 2 2 2 2 2 3 3 1 1 1 1 1 7 7 7 4 4 3 3 3 3 3 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 | // SPDX-License-Identifier: GPL-2.0-only /* * * Authors: * Alexander Aring <aar@pengutronix.de> * * Based on: net/wireless/nl80211.c */ #include <linux/rtnetlink.h> #include <net/cfg802154.h> #include <net/genetlink.h> #include <net/mac802154.h> #include <net/netlink.h> #include <net/nl802154.h> #include <net/sock.h> #include "nl802154.h" #include "rdev-ops.h" #include "core.h" /* the netlink family */ static struct genl_family nl802154_fam; /* multicast groups */ enum nl802154_multicast_groups { NL802154_MCGRP_CONFIG, NL802154_MCGRP_SCAN, }; static const struct genl_multicast_group nl802154_mcgrps[] = { [NL802154_MCGRP_CONFIG] = { .name = "config", }, [NL802154_MCGRP_SCAN] = { .name = "scan", }, }; /* returns ERR_PTR values */ static struct wpan_dev * __cfg802154_wpan_dev_from_attrs(struct net *netns, struct nlattr **attrs) { struct cfg802154_registered_device *rdev; struct wpan_dev *result = NULL; bool have_ifidx = attrs[NL802154_ATTR_IFINDEX]; bool have_wpan_dev_id = attrs[NL802154_ATTR_WPAN_DEV]; u64 wpan_dev_id; int wpan_phy_idx = -1; int ifidx = -1; ASSERT_RTNL(); if (!have_ifidx && !have_wpan_dev_id) return ERR_PTR(-EINVAL); if (have_ifidx) ifidx = nla_get_u32(attrs[NL802154_ATTR_IFINDEX]); if (have_wpan_dev_id) { wpan_dev_id = nla_get_u64(attrs[NL802154_ATTR_WPAN_DEV]); wpan_phy_idx = wpan_dev_id >> 32; } list_for_each_entry(rdev, &cfg802154_rdev_list, list) { struct wpan_dev *wpan_dev; if (wpan_phy_net(&rdev->wpan_phy) != netns) continue; if (have_wpan_dev_id && rdev->wpan_phy_idx != wpan_phy_idx) continue; list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { if (have_ifidx && wpan_dev->netdev && wpan_dev->netdev->ifindex == ifidx) { result = wpan_dev; break; } if (have_wpan_dev_id && wpan_dev->identifier == (u32)wpan_dev_id) { result = wpan_dev; break; } } if (result) break; } if (result) return result; return ERR_PTR(-ENODEV); } static struct cfg802154_registered_device * __cfg802154_rdev_from_attrs(struct net *netns, struct nlattr **attrs) { struct cfg802154_registered_device *rdev = NULL, *tmp; struct net_device *netdev; ASSERT_RTNL(); if (!attrs[NL802154_ATTR_WPAN_PHY] && !attrs[NL802154_ATTR_IFINDEX] && !attrs[NL802154_ATTR_WPAN_DEV]) return ERR_PTR(-EINVAL); if (attrs[NL802154_ATTR_WPAN_PHY]) rdev = cfg802154_rdev_by_wpan_phy_idx( nla_get_u32(attrs[NL802154_ATTR_WPAN_PHY])); if (attrs[NL802154_ATTR_WPAN_DEV]) { u64 wpan_dev_id = nla_get_u64(attrs[NL802154_ATTR_WPAN_DEV]); struct wpan_dev *wpan_dev; bool found = false; tmp = cfg802154_rdev_by_wpan_phy_idx(wpan_dev_id >> 32); if (tmp) { /* make sure wpan_dev exists */ list_for_each_entry(wpan_dev, &tmp->wpan_dev_list, list) { if (wpan_dev->identifier != (u32)wpan_dev_id) continue; found = true; break; } if (!found) tmp = NULL; if (rdev && tmp != rdev) return ERR_PTR(-EINVAL); rdev = tmp; } } if (attrs[NL802154_ATTR_IFINDEX]) { int ifindex = nla_get_u32(attrs[NL802154_ATTR_IFINDEX]); netdev = __dev_get_by_index(netns, ifindex); if (netdev) { if (netdev->ieee802154_ptr) tmp = wpan_phy_to_rdev( netdev->ieee802154_ptr->wpan_phy); else tmp = NULL; /* not wireless device -- return error */ if (!tmp) return ERR_PTR(-EINVAL); /* mismatch -- return error */ if (rdev && tmp != rdev) return ERR_PTR(-EINVAL); rdev = tmp; } } if (!rdev) return ERR_PTR(-ENODEV); if (netns != wpan_phy_net(&rdev->wpan_phy)) return ERR_PTR(-ENODEV); return rdev; } /* This function returns a pointer to the driver * that the genl_info item that is passed refers to. * * The result of this can be a PTR_ERR and hence must * be checked with IS_ERR() for errors. */ static struct cfg802154_registered_device * cfg802154_get_dev_from_info(struct net *netns, struct genl_info *info) { return __cfg802154_rdev_from_attrs(netns, info->attrs); } /* policy for the attributes */ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_WPAN_PHY] = { .type = NLA_U32 }, [NL802154_ATTR_WPAN_PHY_NAME] = { .type = NLA_NUL_STRING, .len = 20-1 }, [NL802154_ATTR_IFINDEX] = { .type = NLA_U32 }, [NL802154_ATTR_IFTYPE] = { .type = NLA_U32 }, [NL802154_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 }, [NL802154_ATTR_WPAN_DEV] = { .type = NLA_U64 }, [NL802154_ATTR_PAGE] = NLA_POLICY_MAX(NLA_U8, IEEE802154_MAX_PAGE), [NL802154_ATTR_CHANNEL] = NLA_POLICY_MAX(NLA_U8, IEEE802154_MAX_CHANNEL), [NL802154_ATTR_TX_POWER] = { .type = NLA_S32, }, [NL802154_ATTR_CCA_MODE] = { .type = NLA_U32, }, [NL802154_ATTR_CCA_OPT] = { .type = NLA_U32, }, [NL802154_ATTR_CCA_ED_LEVEL] = { .type = NLA_S32, }, [NL802154_ATTR_SUPPORTED_CHANNEL] = { .type = NLA_U32, }, [NL802154_ATTR_PAN_ID] = { .type = NLA_U16, }, [NL802154_ATTR_EXTENDED_ADDR] = { .type = NLA_U64 }, [NL802154_ATTR_SHORT_ADDR] = { .type = NLA_U16, }, [NL802154_ATTR_MIN_BE] = { .type = NLA_U8, }, [NL802154_ATTR_MAX_BE] = { .type = NLA_U8, }, [NL802154_ATTR_MAX_CSMA_BACKOFFS] = { .type = NLA_U8, }, [NL802154_ATTR_MAX_FRAME_RETRIES] = { .type = NLA_S8, }, [NL802154_ATTR_LBT_MODE] = { .type = NLA_U8, }, [NL802154_ATTR_WPAN_PHY_CAPS] = { .type = NLA_NESTED }, [NL802154_ATTR_SUPPORTED_COMMANDS] = { .type = NLA_NESTED }, [NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 }, [NL802154_ATTR_PID] = { .type = NLA_U32 }, [NL802154_ATTR_NETNS_FD] = { .type = NLA_U32 }, [NL802154_ATTR_COORDINATOR] = { .type = NLA_NESTED }, [NL802154_ATTR_SCAN_TYPE] = NLA_POLICY_RANGE(NLA_U8, NL802154_SCAN_ED, NL802154_SCAN_RIT_PASSIVE), [NL802154_ATTR_SCAN_CHANNELS] = NLA_POLICY_MASK(NLA_U32, GENMASK(IEEE802154_MAX_CHANNEL, 0)), [NL802154_ATTR_SCAN_PREAMBLE_CODES] = { .type = NLA_REJECT }, [NL802154_ATTR_SCAN_MEAN_PRF] = { .type = NLA_REJECT }, [NL802154_ATTR_SCAN_DURATION] = NLA_POLICY_MAX(NLA_U8, IEEE802154_MAX_SCAN_DURATION), [NL802154_ATTR_SCAN_DONE_REASON] = NLA_POLICY_RANGE(NLA_U8, NL802154_SCAN_DONE_REASON_FINISHED, NL802154_SCAN_DONE_REASON_ABORTED), [NL802154_ATTR_BEACON_INTERVAL] = NLA_POLICY_MAX(NLA_U8, IEEE802154_ACTIVE_SCAN_DURATION), #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, }, [NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, }, [NL802154_ATTR_SEC_OUT_KEY_ID] = { .type = NLA_NESTED, }, [NL802154_ATTR_SEC_FRAME_COUNTER] = { .type = NLA_U32 }, [NL802154_ATTR_SEC_LEVEL] = { .type = NLA_NESTED }, [NL802154_ATTR_SEC_DEVICE] = { .type = NLA_NESTED }, [NL802154_ATTR_SEC_DEVKEY] = { .type = NLA_NESTED }, [NL802154_ATTR_SEC_KEY] = { .type = NLA_NESTED }, #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL static int nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, struct netlink_callback *cb, struct cfg802154_registered_device **rdev, struct wpan_dev **wpan_dev) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); int err; rtnl_lock(); if (!cb->args[0]) { *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk), info->info.attrs); if (IS_ERR(*wpan_dev)) { err = PTR_ERR(*wpan_dev); goto out_unlock; } *rdev = wpan_phy_to_rdev((*wpan_dev)->wpan_phy); /* 0 is the first index - add 1 to parse only once */ cb->args[0] = (*rdev)->wpan_phy_idx + 1; cb->args[1] = (*wpan_dev)->identifier; } else { /* subtract the 1 again here */ struct wpan_phy *wpan_phy = wpan_phy_idx_to_wpan_phy(cb->args[0] - 1); struct wpan_dev *tmp; if (!wpan_phy) { err = -ENODEV; goto out_unlock; } *rdev = wpan_phy_to_rdev(wpan_phy); *wpan_dev = NULL; list_for_each_entry(tmp, &(*rdev)->wpan_dev_list, list) { if (tmp->identifier == cb->args[1]) { *wpan_dev = tmp; break; } } if (!*wpan_dev) { err = -ENODEV; goto out_unlock; } } return 0; out_unlock: rtnl_unlock(); return err; } static void nl802154_finish_wpan_dev_dump(struct cfg802154_registered_device *rdev) { rtnl_unlock(); } #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ /* message building helper */ static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd) { /* since there is no private header just add the generic one */ return genlmsg_put(skb, portid, seq, &nl802154_fam, flags, cmd); } static int nl802154_put_flags(struct sk_buff *msg, int attr, u32 mask) { struct nlattr *nl_flags = nla_nest_start_noflag(msg, attr); int i; if (!nl_flags) return -ENOBUFS; i = 0; while (mask) { if ((mask & 1) && nla_put_flag(msg, i)) return -ENOBUFS; mask >>= 1; i++; } nla_nest_end(msg, nl_flags); return 0; } static int nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev, struct sk_buff *msg) { struct nlattr *nl_page; unsigned long page; nl_page = nla_nest_start_noflag(msg, NL802154_ATTR_CHANNELS_SUPPORTED); if (!nl_page) return -ENOBUFS; for (page = 0; page <= IEEE802154_MAX_PAGE; page++) { if (nla_put_u32(msg, NL802154_ATTR_SUPPORTED_CHANNEL, rdev->wpan_phy.supported.channels[page])) return -ENOBUFS; } nla_nest_end(msg, nl_page); return 0; } static int nl802154_put_capabilities(struct sk_buff *msg, struct cfg802154_registered_device *rdev) { const struct wpan_phy_supported *caps = &rdev->wpan_phy.supported; struct nlattr *nl_caps, *nl_channels; int i; nl_caps = nla_nest_start_noflag(msg, NL802154_ATTR_WPAN_PHY_CAPS); if (!nl_caps) return -ENOBUFS; nl_channels = nla_nest_start_noflag(msg, NL802154_CAP_ATTR_CHANNELS); if (!nl_channels) return -ENOBUFS; for (i = 0; i <= IEEE802154_MAX_PAGE; i++) { if (caps->channels[i]) { if (nl802154_put_flags(msg, i, caps->channels[i])) return -ENOBUFS; } } nla_nest_end(msg, nl_channels); if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { struct nlattr *nl_ed_lvls; nl_ed_lvls = nla_nest_start_noflag(msg, NL802154_CAP_ATTR_CCA_ED_LEVELS); if (!nl_ed_lvls) return -ENOBUFS; for (i = 0; i < caps->cca_ed_levels_size; i++) { if (nla_put_s32(msg, i, caps->cca_ed_levels[i])) return -ENOBUFS; } nla_nest_end(msg, nl_ed_lvls); } if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) { struct nlattr *nl_tx_pwrs; nl_tx_pwrs = nla_nest_start_noflag(msg, NL802154_CAP_ATTR_TX_POWERS); if (!nl_tx_pwrs) return -ENOBUFS; for (i = 0; i < caps->tx_powers_size; i++) { if (nla_put_s32(msg, i, caps->tx_powers[i])) return -ENOBUFS; } nla_nest_end(msg, nl_tx_pwrs); } if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) { if (nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_MODES, caps->cca_modes) || nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_OPTS, caps->cca_opts)) return -ENOBUFS; } if (nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MINBE, caps->min_minbe) || nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MINBE, caps->max_minbe) || nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MAXBE, caps->min_maxbe) || nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MAXBE, caps->max_maxbe) || nla_put_u8(msg, NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS, caps->min_csma_backoffs) || nla_put_u8(msg, NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS, caps->max_csma_backoffs) || nla_put_s8(msg, NL802154_CAP_ATTR_MIN_FRAME_RETRIES, caps->min_frame_retries) || nla_put_s8(msg, NL802154_CAP_ATTR_MAX_FRAME_RETRIES, caps->max_frame_retries) || nl802154_put_flags(msg, NL802154_CAP_ATTR_IFTYPES, caps->iftypes) || nla_put_u32(msg, NL802154_CAP_ATTR_LBT, caps->lbt)) return -ENOBUFS; nla_nest_end(msg, nl_caps); return 0; } static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, enum nl802154_commands cmd, struct sk_buff *msg, u32 portid, u32 seq, int flags) { struct nlattr *nl_cmds; void *hdr; int i; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx) || nla_put_string(msg, NL802154_ATTR_WPAN_PHY_NAME, wpan_phy_name(&rdev->wpan_phy)) || nla_put_u32(msg, NL802154_ATTR_GENERATION, cfg802154_rdev_list_generation)) goto nla_put_failure; if (cmd != NL802154_CMD_NEW_WPAN_PHY) goto finish; /* DUMP PHY PIB */ /* current channel settings */ if (nla_put_u8(msg, NL802154_ATTR_PAGE, rdev->wpan_phy.current_page) || nla_put_u8(msg, NL802154_ATTR_CHANNEL, rdev->wpan_phy.current_channel)) goto nla_put_failure; /* TODO remove this behaviour, we still keep support it for a while * so users can change the behaviour to the new one. */ if (nl802154_send_wpan_phy_channels(rdev, msg)) goto nla_put_failure; /* cca mode */ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) { if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE, rdev->wpan_phy.cca.mode)) goto nla_put_failure; if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) { if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT, rdev->wpan_phy.cca.opt)) goto nla_put_failure; } } if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) { if (nla_put_s32(msg, NL802154_ATTR_TX_POWER, rdev->wpan_phy.transmit_power)) goto nla_put_failure; } if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { if (nla_put_s32(msg, NL802154_ATTR_CCA_ED_LEVEL, rdev->wpan_phy.cca_ed_level)) goto nla_put_failure; } if (nl802154_put_capabilities(msg, rdev)) goto nla_put_failure; nl_cmds = nla_nest_start_noflag(msg, NL802154_ATTR_SUPPORTED_COMMANDS); if (!nl_cmds) goto nla_put_failure; i = 0; #define CMD(op, n) \ do { \ if (rdev->ops->op) { \ i++; \ if (nla_put_u32(msg, i, NL802154_CMD_ ## n)) \ goto nla_put_failure; \ } \ } while (0) CMD(add_virtual_intf, NEW_INTERFACE); CMD(del_virtual_intf, DEL_INTERFACE); CMD(set_channel, SET_CHANNEL); CMD(set_pan_id, SET_PAN_ID); CMD(set_short_addr, SET_SHORT_ADDR); CMD(set_backoff_exponent, SET_BACKOFF_EXPONENT); CMD(set_max_csma_backoffs, SET_MAX_CSMA_BACKOFFS); CMD(set_max_frame_retries, SET_MAX_FRAME_RETRIES); CMD(set_lbt_mode, SET_LBT_MODE); CMD(set_ackreq_default, SET_ACKREQ_DEFAULT); if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) CMD(set_tx_power, SET_TX_POWER); if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) CMD(set_cca_ed_level, SET_CCA_ED_LEVEL); if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) CMD(set_cca_mode, SET_CCA_MODE); #undef CMD nla_nest_end(msg, nl_cmds); finish: genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } struct nl802154_dump_wpan_phy_state { s64 filter_wpan_phy; long start; }; static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb, struct netlink_callback *cb, struct nl802154_dump_wpan_phy_state *state) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct nlattr **tb = info->info.attrs; if (tb[NL802154_ATTR_WPAN_PHY]) state->filter_wpan_phy = nla_get_u32(tb[NL802154_ATTR_WPAN_PHY]); if (tb[NL802154_ATTR_WPAN_DEV]) state->filter_wpan_phy = nla_get_u64(tb[NL802154_ATTR_WPAN_DEV]) >> 32; if (tb[NL802154_ATTR_IFINDEX]) { struct net_device *netdev; struct cfg802154_registered_device *rdev; int ifidx = nla_get_u32(tb[NL802154_ATTR_IFINDEX]); netdev = __dev_get_by_index(&init_net, ifidx); if (!netdev) return -ENODEV; if (netdev->ieee802154_ptr) { rdev = wpan_phy_to_rdev( netdev->ieee802154_ptr->wpan_phy); state->filter_wpan_phy = rdev->wpan_phy_idx; } } return 0; } static int nl802154_dump_wpan_phy(struct sk_buff *skb, struct netlink_callback *cb) { int idx = 0, ret; struct nl802154_dump_wpan_phy_state *state = (void *)cb->args[0]; struct cfg802154_registered_device *rdev; rtnl_lock(); if (!state) { state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) { rtnl_unlock(); return -ENOMEM; } state->filter_wpan_phy = -1; ret = nl802154_dump_wpan_phy_parse(skb, cb, state); if (ret) { kfree(state); rtnl_unlock(); return ret; } cb->args[0] = (long)state; } list_for_each_entry(rdev, &cfg802154_rdev_list, list) { if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk))) continue; if (++idx <= state->start) continue; if (state->filter_wpan_phy != -1 && state->filter_wpan_phy != rdev->wpan_phy_idx) continue; /* attempt to fit multiple wpan_phy data chunks into the skb */ ret = nl802154_send_wpan_phy(rdev, NL802154_CMD_NEW_WPAN_PHY, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (ret < 0) { if ((ret == -ENOBUFS || ret == -EMSGSIZE) && !skb->len && cb->min_dump_alloc < 4096) { cb->min_dump_alloc = 4096; rtnl_unlock(); return 1; } idx--; break; } break; } rtnl_unlock(); state->start = idx; return skb->len; } static int nl802154_dump_wpan_phy_done(struct netlink_callback *cb) { kfree((void *)cb->args[0]); return 0; } static int nl802154_get_wpan_phy(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct cfg802154_registered_device *rdev = info->user_ptr[0]; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; if (nl802154_send_wpan_phy(rdev, NL802154_CMD_NEW_WPAN_PHY, msg, info->snd_portid, info->snd_seq, 0) < 0) { nlmsg_free(msg); return -ENOBUFS; } return genlmsg_reply(msg, info); } static inline u64 wpan_dev_id(struct wpan_dev *wpan_dev) { return (u64)wpan_dev->identifier | ((u64)wpan_phy_to_rdev(wpan_dev->wpan_phy)->wpan_phy_idx << 32); } #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL #include <net/ieee802154_netdev.h> static int ieee802154_llsec_send_key_id(struct sk_buff *msg, const struct ieee802154_llsec_key_id *desc) { struct nlattr *nl_dev_addr; if (nla_put_u32(msg, NL802154_KEY_ID_ATTR_MODE, desc->mode)) return -ENOBUFS; switch (desc->mode) { case NL802154_KEY_ID_MODE_IMPLICIT: nl_dev_addr = nla_nest_start_noflag(msg, NL802154_KEY_ID_ATTR_IMPLICIT); if (!nl_dev_addr) return -ENOBUFS; if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_PAN_ID, desc->device_addr.pan_id) || nla_put_u32(msg, NL802154_DEV_ADDR_ATTR_MODE, desc->device_addr.mode)) return -ENOBUFS; switch (desc->device_addr.mode) { case NL802154_DEV_ADDR_SHORT: if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_SHORT, desc->device_addr.short_addr)) return -ENOBUFS; break; case NL802154_DEV_ADDR_EXTENDED: if (nla_put_le64(msg, NL802154_DEV_ADDR_ATTR_EXTENDED, desc->device_addr.extended_addr, NL802154_DEV_ADDR_ATTR_PAD)) return -ENOBUFS; break; default: /* userspace should handle unknown */ break; } nla_nest_end(msg, nl_dev_addr); break; case NL802154_KEY_ID_MODE_INDEX: break; case NL802154_KEY_ID_MODE_INDEX_SHORT: /* TODO renmae short_source? */ if (nla_put_le32(msg, NL802154_KEY_ID_ATTR_SOURCE_SHORT, desc->short_source)) return -ENOBUFS; break; case NL802154_KEY_ID_MODE_INDEX_EXTENDED: if (nla_put_le64(msg, NL802154_KEY_ID_ATTR_SOURCE_EXTENDED, desc->extended_source, NL802154_KEY_ID_ATTR_PAD)) return -ENOBUFS; break; default: /* userspace should handle unknown */ break; } /* TODO key_id to key_idx ? Check naming */ if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) { if (nla_put_u8(msg, NL802154_KEY_ID_ATTR_INDEX, desc->id)) return -ENOBUFS; } return 0; } static int nl802154_get_llsec_params(struct sk_buff *msg, struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { struct nlattr *nl_key_id; struct ieee802154_llsec_params params; int ret; ret = rdev_get_llsec_params(rdev, wpan_dev, ¶ms); if (ret < 0) return ret; if (nla_put_u8(msg, NL802154_ATTR_SEC_ENABLED, params.enabled) || nla_put_u32(msg, NL802154_ATTR_SEC_OUT_LEVEL, params.out_level) || nla_put_be32(msg, NL802154_ATTR_SEC_FRAME_COUNTER, params.frame_counter)) return -ENOBUFS; nl_key_id = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_OUT_KEY_ID); if (!nl_key_id) return -ENOBUFS; ret = ieee802154_llsec_send_key_id(msg, ¶ms.out_key); if (ret < 0) return ret; nla_nest_end(msg, nl_key_id); return 0; } #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ static int nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { struct net_device *dev = wpan_dev->netdev; void *hdr; hdr = nl802154hdr_put(msg, portid, seq, flags, NL802154_CMD_NEW_INTERFACE); if (!hdr) return -1; if (dev && (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex) || nla_put_string(msg, NL802154_ATTR_IFNAME, dev->name))) goto nla_put_failure; if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx) || nla_put_u32(msg, NL802154_ATTR_IFTYPE, wpan_dev->iftype) || nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV, wpan_dev_id(wpan_dev), NL802154_ATTR_PAD) || nla_put_u32(msg, NL802154_ATTR_GENERATION, rdev->devlist_generation ^ (cfg802154_rdev_list_generation << 2))) goto nla_put_failure; /* address settings */ if (nla_put_le64(msg, NL802154_ATTR_EXTENDED_ADDR, wpan_dev->extended_addr, NL802154_ATTR_PAD) || nla_put_le16(msg, NL802154_ATTR_SHORT_ADDR, wpan_dev->short_addr) || nla_put_le16(msg, NL802154_ATTR_PAN_ID, wpan_dev->pan_id)) goto nla_put_failure; /* ARET handling */ if (nla_put_s8(msg, NL802154_ATTR_MAX_FRAME_RETRIES, wpan_dev->frame_retries) || nla_put_u8(msg, NL802154_ATTR_MAX_BE, wpan_dev->max_be) || nla_put_u8(msg, NL802154_ATTR_MAX_CSMA_BACKOFFS, wpan_dev->csma_retries) || nla_put_u8(msg, NL802154_ATTR_MIN_BE, wpan_dev->min_be)) goto nla_put_failure; /* listen before transmit */ if (nla_put_u8(msg, NL802154_ATTR_LBT_MODE, wpan_dev->lbt)) goto nla_put_failure; /* ackreq default behaviour */ if (nla_put_u8(msg, NL802154_ATTR_ACKREQ_DEFAULT, wpan_dev->ackreq)) goto nla_put_failure; #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) goto out; if (nl802154_get_llsec_params(msg, rdev, wpan_dev) < 0) goto nla_put_failure; out: #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nl802154_dump_interface(struct sk_buff *skb, struct netlink_callback *cb) { int wp_idx = 0; int if_idx = 0; int wp_start = cb->args[0]; int if_start = cb->args[1]; struct cfg802154_registered_device *rdev; struct wpan_dev *wpan_dev; rtnl_lock(); list_for_each_entry(rdev, &cfg802154_rdev_list, list) { if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk))) continue; if (wp_idx < wp_start) { wp_idx++; continue; } if_idx = 0; list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { if (if_idx < if_start) { if_idx++; continue; } if (nl802154_send_iface(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wpan_dev) < 0) { goto out; } if_idx++; } wp_idx++; } out: rtnl_unlock(); cb->args[0] = wp_idx; cb->args[1] = if_idx; return skb->len; } static int nl802154_get_interface(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct wpan_dev *wdev = info->user_ptr[1]; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; if (nl802154_send_iface(msg, info->snd_portid, info->snd_seq, 0, rdev, wdev) < 0) { nlmsg_free(msg); return -ENOBUFS; } return genlmsg_reply(msg, info); } static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; enum nl802154_iftype type = NL802154_IFTYPE_UNSPEC; __le64 extended_addr = cpu_to_le64(0x0000000000000000ULL); /* TODO avoid failing a new interface * creation due to pending removal? */ if (!info->attrs[NL802154_ATTR_IFNAME]) return -EINVAL; if (info->attrs[NL802154_ATTR_IFTYPE]) { type = nla_get_u32(info->attrs[NL802154_ATTR_IFTYPE]); if (type > NL802154_IFTYPE_MAX || !(rdev->wpan_phy.supported.iftypes & BIT(type))) return -EINVAL; } if (info->attrs[NL802154_ATTR_EXTENDED_ADDR]) extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]); if (!rdev->ops->add_virtual_intf) return -EOPNOTSUPP; return rdev_add_virtual_intf(rdev, nla_data(info->attrs[NL802154_ATTR_IFNAME]), NET_NAME_USER, type, extended_addr); } static int nl802154_del_interface(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct wpan_dev *wpan_dev = info->user_ptr[1]; if (!rdev->ops->del_virtual_intf) return -EOPNOTSUPP; /* If we remove a wpan device without a netdev then clear * user_ptr[1] so that nl802154_post_doit won't dereference it * to check if it needs to do dev_put(). Otherwise it crashes * since the wpan_dev has been freed, unlike with a netdev where * we need the dev_put() for the netdev to really be freed. */ if (!wpan_dev->netdev) info->user_ptr[1] = NULL; return rdev_del_virtual_intf(rdev, wpan_dev); } static int nl802154_set_channel(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; u8 channel, page; if (!info->attrs[NL802154_ATTR_PAGE] || !info->attrs[NL802154_ATTR_CHANNEL]) return -EINVAL; page = nla_get_u8(info->attrs[NL802154_ATTR_PAGE]); channel = nla_get_u8(info->attrs[NL802154_ATTR_CHANNEL]); /* check 802.15.4 constraints */ if (!ieee802154_chan_is_valid(&rdev->wpan_phy, page, channel)) return -EINVAL; return rdev_set_channel(rdev, page, channel); } static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct wpan_phy_cca cca; if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE)) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_CCA_MODE]) return -EINVAL; cca.mode = nla_get_u32(info->attrs[NL802154_ATTR_CCA_MODE]); /* checking 802.15.4 constraints */ if (cca.mode < NL802154_CCA_ENERGY || cca.mode > NL802154_CCA_ATTR_MAX || !(rdev->wpan_phy.supported.cca_modes & BIT(cca.mode))) return -EINVAL; if (cca.mode == NL802154_CCA_ENERGY_CARRIER) { if (!info->attrs[NL802154_ATTR_CCA_OPT]) return -EINVAL; cca.opt = nla_get_u32(info->attrs[NL802154_ATTR_CCA_OPT]); if (cca.opt > NL802154_CCA_OPT_ATTR_MAX || !(rdev->wpan_phy.supported.cca_opts & BIT(cca.opt))) return -EINVAL; } return rdev_set_cca_mode(rdev, &cca); } static int nl802154_set_cca_ed_level(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; s32 ed_level; int i; if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL)) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_CCA_ED_LEVEL]) return -EINVAL; ed_level = nla_get_s32(info->attrs[NL802154_ATTR_CCA_ED_LEVEL]); for (i = 0; i < rdev->wpan_phy.supported.cca_ed_levels_size; i++) { if (ed_level == rdev->wpan_phy.supported.cca_ed_levels[i]) return rdev_set_cca_ed_level(rdev, ed_level); } return -EINVAL; } static int nl802154_set_tx_power(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; s32 power; int i; if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER)) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_TX_POWER]) return -EINVAL; power = nla_get_s32(info->attrs[NL802154_ATTR_TX_POWER]); for (i = 0; i < rdev->wpan_phy.supported.tx_powers_size; i++) { if (power == rdev->wpan_phy.supported.tx_powers[i]) return rdev_set_tx_power(rdev, power); } return -EINVAL; } static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; __le16 pan_id; /* conflict here while tx/rx calls */ if (netif_running(dev)) return -EBUSY; if (wpan_dev->lowpan_dev) { if (netif_running(wpan_dev->lowpan_dev)) return -EBUSY; } /* don't change address fields on monitor */ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || !info->attrs[NL802154_ATTR_PAN_ID]) return -EINVAL; pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]); /* TODO * I am not sure about to check here on broadcast pan_id. * Broadcast is a valid setting, comment from 802.15.4: * If this value is 0xffff, the device is not associated. * * This could useful to simple deassociate an device. */ if (pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) return -EINVAL; return rdev_set_pan_id(rdev, wpan_dev, pan_id); } static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; __le16 short_addr; /* conflict here while tx/rx calls */ if (netif_running(dev)) return -EBUSY; if (wpan_dev->lowpan_dev) { if (netif_running(wpan_dev->lowpan_dev)) return -EBUSY; } /* don't change address fields on monitor */ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || !info->attrs[NL802154_ATTR_SHORT_ADDR]) return -EINVAL; short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]); /* TODO * I am not sure about to check here on broadcast short_addr. * Broadcast is a valid setting, comment from 802.15.4: * A value of 0xfffe indicates that the device has * associated but has not been allocated an address. A * value of 0xffff indicates that the device does not * have a short address. * * I think we should allow to set these settings but * don't allow to allow socket communication with it. */ if (short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC) || short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST)) return -EINVAL; return rdev_set_short_addr(rdev, wpan_dev, short_addr); } static int nl802154_set_backoff_exponent(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; u8 min_be, max_be; /* should be set on netif open inside phy settings */ if (netif_running(dev)) return -EBUSY; if (!info->attrs[NL802154_ATTR_MIN_BE] || !info->attrs[NL802154_ATTR_MAX_BE]) return -EINVAL; min_be = nla_get_u8(info->attrs[NL802154_ATTR_MIN_BE]); max_be = nla_get_u8(info->attrs[NL802154_ATTR_MAX_BE]); /* check 802.15.4 constraints */ if (min_be < rdev->wpan_phy.supported.min_minbe || min_be > rdev->wpan_phy.supported.max_minbe || max_be < rdev->wpan_phy.supported.min_maxbe || max_be > rdev->wpan_phy.supported.max_maxbe || min_be > max_be) return -EINVAL; return rdev_set_backoff_exponent(rdev, wpan_dev, min_be, max_be); } static int nl802154_set_max_csma_backoffs(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; u8 max_csma_backoffs; /* conflict here while other running iface settings */ if (netif_running(dev)) return -EBUSY; if (!info->attrs[NL802154_ATTR_MAX_CSMA_BACKOFFS]) return -EINVAL; max_csma_backoffs = nla_get_u8( info->attrs[NL802154_ATTR_MAX_CSMA_BACKOFFS]); /* check 802.15.4 constraints */ if (max_csma_backoffs < rdev->wpan_phy.supported.min_csma_backoffs || max_csma_backoffs > rdev->wpan_phy.supported.max_csma_backoffs) return -EINVAL; return rdev_set_max_csma_backoffs(rdev, wpan_dev, max_csma_backoffs); } static int nl802154_set_max_frame_retries(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; s8 max_frame_retries; if (netif_running(dev)) return -EBUSY; if (!info->attrs[NL802154_ATTR_MAX_FRAME_RETRIES]) return -EINVAL; max_frame_retries = nla_get_s8( info->attrs[NL802154_ATTR_MAX_FRAME_RETRIES]); /* check 802.15.4 constraints */ if (max_frame_retries < rdev->wpan_phy.supported.min_frame_retries || max_frame_retries > rdev->wpan_phy.supported.max_frame_retries) return -EINVAL; return rdev_set_max_frame_retries(rdev, wpan_dev, max_frame_retries); } static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; int mode; if (netif_running(dev)) return -EBUSY; if (!info->attrs[NL802154_ATTR_LBT_MODE]) return -EINVAL; mode = nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]); if (mode != 0 && mode != 1) return -EINVAL; if (!wpan_phy_supported_bool(mode, rdev->wpan_phy.supported.lbt)) return -EINVAL; return rdev_set_lbt_mode(rdev, wpan_dev, mode); } static int nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; int ackreq; if (netif_running(dev)) return -EBUSY; if (!info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]) return -EINVAL; ackreq = nla_get_u8(info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]); if (ackreq != 0 && ackreq != 1) return -EINVAL; return rdev_set_ackreq_default(rdev, wpan_dev, ackreq); } static int nl802154_wpan_phy_netns(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net *net; int err; if (info->attrs[NL802154_ATTR_PID]) { u32 pid = nla_get_u32(info->attrs[NL802154_ATTR_PID]); net = get_net_ns_by_pid(pid); } else if (info->attrs[NL802154_ATTR_NETNS_FD]) { u32 fd = nla_get_u32(info->attrs[NL802154_ATTR_NETNS_FD]); net = get_net_ns_by_fd(fd); } else { return -EINVAL; } if (IS_ERR(net)) return PTR_ERR(net); err = 0; /* check if anything to do */ if (!net_eq(wpan_phy_net(&rdev->wpan_phy), net)) err = cfg802154_switch_netns(rdev, net); put_net(net); return err; } static int nl802154_prep_scan_event_msg(struct sk_buff *msg, struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u32 portid, u32 seq, int flags, u8 cmd, struct ieee802154_coord_desc *desc) { struct nlattr *nla; void *hdr; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx)) goto nla_put_failure; if (wpan_dev->netdev && nla_put_u32(msg, NL802154_ATTR_IFINDEX, wpan_dev->netdev->ifindex)) goto nla_put_failure; if (nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV, wpan_dev_id(wpan_dev), NL802154_ATTR_PAD)) goto nla_put_failure; nla = nla_nest_start_noflag(msg, NL802154_ATTR_COORDINATOR); if (!nla) goto nla_put_failure; if (nla_put(msg, NL802154_COORD_PANID, IEEE802154_PAN_ID_LEN, &desc->addr.pan_id)) goto nla_put_failure; if (desc->addr.mode == IEEE802154_ADDR_SHORT) { if (nla_put(msg, NL802154_COORD_ADDR, IEEE802154_SHORT_ADDR_LEN, &desc->addr.short_addr)) goto nla_put_failure; } else { if (nla_put(msg, NL802154_COORD_ADDR, IEEE802154_EXTENDED_ADDR_LEN, &desc->addr.extended_addr)) goto nla_put_failure; } if (nla_put_u8(msg, NL802154_COORD_CHANNEL, desc->channel)) goto nla_put_failure; if (nla_put_u8(msg, NL802154_COORD_PAGE, desc->page)) goto nla_put_failure; if (nla_put_u16(msg, NL802154_COORD_SUPERFRAME_SPEC, desc->superframe_spec)) goto nla_put_failure; if (nla_put_u8(msg, NL802154_COORD_LINK_QUALITY, desc->link_quality)) goto nla_put_failure; if (desc->gts_permit && nla_put_flag(msg, NL802154_COORD_GTS_PERMIT)) goto nla_put_failure; /* TODO: NL802154_COORD_PAYLOAD_DATA if any */ nla_nest_end(msg, nla); genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int nl802154_scan_event(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_coord_desc *desc) { struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy); struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; ret = nl802154_prep_scan_event_msg(msg, rdev, wpan_dev, 0, 0, 0, NL802154_CMD_SCAN_EVENT, desc); if (ret < 0) { nlmsg_free(msg); return ret; } return genlmsg_multicast_netns(&nl802154_fam, wpan_phy_net(wpan_phy), msg, 0, NL802154_MCGRP_SCAN, GFP_ATOMIC); } EXPORT_SYMBOL_GPL(nl802154_scan_event); static int nl802154_trigger_scan(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct wpan_phy *wpan_phy = &rdev->wpan_phy; struct cfg802154_scan_request *request; u8 type; int err; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { NL_SET_ERR_MSG(info->extack, "Monitors are not allowed to perform scans"); return -EOPNOTSUPP; } if (!info->attrs[NL802154_ATTR_SCAN_TYPE]) { NL_SET_ERR_MSG(info->extack, "Malformed request, missing scan type"); return -EINVAL; } if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) { NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams"); return -EOPNOTSUPP; } request = kzalloc(sizeof(*request), GFP_KERNEL); if (!request) return -ENOMEM; request->wpan_dev = wpan_dev; request->wpan_phy = wpan_phy; type = nla_get_u8(info->attrs[NL802154_ATTR_SCAN_TYPE]); switch (type) { case NL802154_SCAN_ACTIVE: case NL802154_SCAN_PASSIVE: request->type = type; break; default: NL_SET_ERR_MSG_FMT(info->extack, "Unsupported scan type: %d", type); err = -EINVAL; goto free_request; } /* Use current page by default */ if (info->attrs[NL802154_ATTR_PAGE]) request->page = nla_get_u8(info->attrs[NL802154_ATTR_PAGE]); else request->page = wpan_phy->current_page; /* Scan all supported channels by default */ if (info->attrs[NL802154_ATTR_SCAN_CHANNELS]) request->channels = nla_get_u32(info->attrs[NL802154_ATTR_SCAN_CHANNELS]); else request->channels = wpan_phy->supported.channels[request->page]; /* Use maximum duration order by default */ if (info->attrs[NL802154_ATTR_SCAN_DURATION]) request->duration = nla_get_u8(info->attrs[NL802154_ATTR_SCAN_DURATION]); else request->duration = IEEE802154_MAX_SCAN_DURATION; err = rdev_trigger_scan(rdev, request); if (err) { pr_err("Failure starting scanning (%d)\n", err); goto free_request; } return 0; free_request: kfree(request); return err; } static int nl802154_prep_scan_msg(struct sk_buff *msg, struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u32 portid, u32 seq, int flags, u8 cmd, u8 arg) { void *hdr; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx)) goto nla_put_failure; if (wpan_dev->netdev && nla_put_u32(msg, NL802154_ATTR_IFINDEX, wpan_dev->netdev->ifindex)) goto nla_put_failure; if (nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV, wpan_dev_id(wpan_dev), NL802154_ATTR_PAD)) goto nla_put_failure; if (cmd == NL802154_CMD_SCAN_DONE && nla_put_u8(msg, NL802154_ATTR_SCAN_DONE_REASON, arg)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nl802154_send_scan_msg(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u8 cmd, u8 arg) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = nl802154_prep_scan_msg(msg, rdev, wpan_dev, 0, 0, 0, cmd, arg); if (ret < 0) { nlmsg_free(msg); return ret; } return genlmsg_multicast_netns(&nl802154_fam, wpan_phy_net(&rdev->wpan_phy), msg, 0, NL802154_MCGRP_SCAN, GFP_KERNEL); } int nl802154_scan_started(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev) { struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy); int err; /* Ignore errors when there are no listeners */ err = nl802154_send_scan_msg(rdev, wpan_dev, NL802154_CMD_TRIGGER_SCAN, 0); if (err == -ESRCH) err = 0; return err; } EXPORT_SYMBOL_GPL(nl802154_scan_started); int nl802154_scan_done(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, enum nl802154_scan_done_reasons reason) { struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy); int err; /* Ignore errors when there are no listeners */ err = nl802154_send_scan_msg(rdev, wpan_dev, NL802154_CMD_SCAN_DONE, reason); if (err == -ESRCH) err = 0; return err; } EXPORT_SYMBOL_GPL(nl802154_scan_done); static int nl802154_abort_scan(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; /* Resources are released in the notification helper above */ return rdev_abort_scan(rdev, wpan_dev); } static int nl802154_send_beacons(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct wpan_phy *wpan_phy = &rdev->wpan_phy; struct cfg802154_beacon_request *request; int err; if (wpan_dev->iftype != NL802154_IFTYPE_COORD) { NL_SET_ERR_MSG(info->extack, "Only coordinators can send beacons"); return -EOPNOTSUPP; } if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) { NL_SET_ERR_MSG(info->extack, "Device is not part of any PAN"); return -EPERM; } if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) { NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams"); return -EOPNOTSUPP; } request = kzalloc(sizeof(*request), GFP_KERNEL); if (!request) return -ENOMEM; request->wpan_dev = wpan_dev; request->wpan_phy = wpan_phy; /* Use maximum duration order by default */ if (info->attrs[NL802154_ATTR_BEACON_INTERVAL]) request->interval = nla_get_u8(info->attrs[NL802154_ATTR_BEACON_INTERVAL]); else request->interval = IEEE802154_MAX_SCAN_DURATION; err = rdev_send_beacons(rdev, request); if (err) { pr_err("Failure starting sending beacons (%d)\n", err); goto free_request; } return 0; free_request: kfree(request); return err; } void nl802154_beaconing_done(struct wpan_dev *wpan_dev) { /* NOP */ } EXPORT_SYMBOL_GPL(nl802154_beaconing_done); static int nl802154_stop_beacons(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; /* Resources are released in the notification helper above */ return rdev_stop_beacons(rdev, wpan_dev); } #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = { [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 }, [NL802154_DEV_ADDR_ATTR_MODE] = { .type = NLA_U32 }, [NL802154_DEV_ADDR_ATTR_SHORT] = { .type = NLA_U16 }, [NL802154_DEV_ADDR_ATTR_EXTENDED] = { .type = NLA_U64 }, }; static int ieee802154_llsec_parse_dev_addr(struct nlattr *nla, struct ieee802154_addr *addr) { struct nlattr *attrs[NL802154_DEV_ADDR_ATTR_MAX + 1]; if (!nla || nla_parse_nested_deprecated(attrs, NL802154_DEV_ADDR_ATTR_MAX, nla, nl802154_dev_addr_policy, NULL)) return -EINVAL; if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] || !attrs[NL802154_DEV_ADDR_ATTR_MODE]) return -EINVAL; addr->pan_id = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_PAN_ID]); addr->mode = nla_get_u32(attrs[NL802154_DEV_ADDR_ATTR_MODE]); switch (addr->mode) { case NL802154_DEV_ADDR_SHORT: if (!attrs[NL802154_DEV_ADDR_ATTR_SHORT]) return -EINVAL; addr->short_addr = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_SHORT]); break; case NL802154_DEV_ADDR_EXTENDED: if (!attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]) return -EINVAL; addr->extended_addr = nla_get_le64(attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]); break; default: return -EINVAL; } return 0; } static const struct nla_policy nl802154_key_id_policy[NL802154_KEY_ID_ATTR_MAX + 1] = { [NL802154_KEY_ID_ATTR_MODE] = { .type = NLA_U32 }, [NL802154_KEY_ID_ATTR_INDEX] = { .type = NLA_U8 }, [NL802154_KEY_ID_ATTR_IMPLICIT] = { .type = NLA_NESTED }, [NL802154_KEY_ID_ATTR_SOURCE_SHORT] = { .type = NLA_U32 }, [NL802154_KEY_ID_ATTR_SOURCE_EXTENDED] = { .type = NLA_U64 }, }; static int ieee802154_llsec_parse_key_id(struct nlattr *nla, struct ieee802154_llsec_key_id *desc) { struct nlattr *attrs[NL802154_KEY_ID_ATTR_MAX + 1]; if (!nla || nla_parse_nested_deprecated(attrs, NL802154_KEY_ID_ATTR_MAX, nla, nl802154_key_id_policy, NULL)) return -EINVAL; if (!attrs[NL802154_KEY_ID_ATTR_MODE]) return -EINVAL; desc->mode = nla_get_u32(attrs[NL802154_KEY_ID_ATTR_MODE]); switch (desc->mode) { case NL802154_KEY_ID_MODE_IMPLICIT: if (!attrs[NL802154_KEY_ID_ATTR_IMPLICIT]) return -EINVAL; if (ieee802154_llsec_parse_dev_addr(attrs[NL802154_KEY_ID_ATTR_IMPLICIT], &desc->device_addr) < 0) return -EINVAL; break; case NL802154_KEY_ID_MODE_INDEX: break; case NL802154_KEY_ID_MODE_INDEX_SHORT: if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT]) return -EINVAL; desc->short_source = nla_get_le32(attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT]); break; case NL802154_KEY_ID_MODE_INDEX_EXTENDED: if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED]) return -EINVAL; desc->extended_source = nla_get_le64(attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED]); break; default: return -EINVAL; } if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) { if (!attrs[NL802154_KEY_ID_ATTR_INDEX]) return -EINVAL; /* TODO change id to idx */ desc->id = nla_get_u8(attrs[NL802154_KEY_ID_ATTR_INDEX]); } return 0; } static int nl802154_set_llsec_params(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct ieee802154_llsec_params params; u32 changed = 0; int ret; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (info->attrs[NL802154_ATTR_SEC_ENABLED]) { u8 enabled; enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]); if (enabled != 0 && enabled != 1) return -EINVAL; params.enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]); changed |= IEEE802154_LLSEC_PARAM_ENABLED; } if (info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID]) { ret = ieee802154_llsec_parse_key_id(info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID], ¶ms.out_key); if (ret < 0) return ret; changed |= IEEE802154_LLSEC_PARAM_OUT_KEY; } if (info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]) { params.out_level = nla_get_u32(info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]); if (params.out_level > NL802154_SECLEVEL_MAX) return -EINVAL; changed |= IEEE802154_LLSEC_PARAM_OUT_LEVEL; } if (info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]) { params.frame_counter = nla_get_be32(info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]); changed |= IEEE802154_LLSEC_PARAM_FRAME_COUNTER; } return rdev_set_llsec_params(rdev, wpan_dev, ¶ms, changed); } static int nl802154_send_key(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg802154_registered_device *rdev, struct net_device *dev, const struct ieee802154_llsec_key_entry *key) { void *hdr; u32 commands[NL802154_CMD_FRAME_NR_IDS / 32]; struct nlattr *nl_key, *nl_key_id; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; nl_key = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_KEY); if (!nl_key) goto nla_put_failure; nl_key_id = nla_nest_start_noflag(msg, NL802154_KEY_ATTR_ID); if (!nl_key_id) goto nla_put_failure; if (ieee802154_llsec_send_key_id(msg, &key->id) < 0) goto nla_put_failure; nla_nest_end(msg, nl_key_id); if (nla_put_u8(msg, NL802154_KEY_ATTR_USAGE_FRAMES, key->key->frame_types)) goto nla_put_failure; if (key->key->frame_types & BIT(NL802154_FRAME_CMD)) { /* TODO for each nested */ memset(commands, 0, sizeof(commands)); commands[7] = key->key->cmd_frame_ids; if (nla_put(msg, NL802154_KEY_ATTR_USAGE_CMDS, sizeof(commands), commands)) goto nla_put_failure; } if (nla_put(msg, NL802154_KEY_ATTR_BYTES, NL802154_KEY_SIZE, key->key->key)) goto nla_put_failure; nla_nest_end(msg, nl_key); genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nl802154_dump_llsec_key(struct sk_buff *skb, struct netlink_callback *cb) { struct cfg802154_registered_device *rdev = NULL; struct ieee802154_llsec_key_entry *key; struct ieee802154_llsec_table *table; struct wpan_dev *wpan_dev; int err; err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); if (err) return err; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { err = skb->len; goto out_err; } if (!wpan_dev->netdev) { err = -EINVAL; goto out_err; } rdev_lock_llsec_table(rdev, wpan_dev); rdev_get_llsec_table(rdev, wpan_dev, &table); /* TODO make it like station dump */ if (cb->args[2]) goto out; list_for_each_entry(key, &table->keys, list) { if (nl802154_send_key(skb, NL802154_CMD_NEW_SEC_KEY, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wpan_dev->netdev, key) < 0) { /* TODO */ err = -EIO; rdev_unlock_llsec_table(rdev, wpan_dev); goto out_err; } } cb->args[2] = 1; out: rdev_unlock_llsec_table(rdev, wpan_dev); err = skb->len; out_err: nl802154_finish_wpan_dev_dump(rdev); return err; } static const struct nla_policy nl802154_key_policy[NL802154_KEY_ATTR_MAX + 1] = { [NL802154_KEY_ATTR_ID] = { NLA_NESTED }, /* TODO handle it as for_each_nested and NLA_FLAG? */ [NL802154_KEY_ATTR_USAGE_FRAMES] = { NLA_U8 }, /* TODO handle it as for_each_nested, not static array? */ [NL802154_KEY_ATTR_USAGE_CMDS] = { .len = NL802154_CMD_FRAME_NR_IDS / 8 }, [NL802154_KEY_ATTR_BYTES] = { .len = NL802154_KEY_SIZE }, }; static int nl802154_add_llsec_key(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1]; struct ieee802154_llsec_key key = { }; struct ieee802154_llsec_key_id id = { }; u32 commands[NL802154_CMD_FRAME_NR_IDS / 32] = { }; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_SEC_KEY] || nla_parse_nested_deprecated(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], nl802154_key_policy, info->extack)) return -EINVAL; if (!attrs[NL802154_KEY_ATTR_USAGE_FRAMES] || !attrs[NL802154_KEY_ATTR_BYTES]) return -EINVAL; if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) return -ENOBUFS; key.frame_types = nla_get_u8(attrs[NL802154_KEY_ATTR_USAGE_FRAMES]); if (key.frame_types > BIT(NL802154_FRAME_MAX) || ((key.frame_types & BIT(NL802154_FRAME_CMD)) && !attrs[NL802154_KEY_ATTR_USAGE_CMDS])) return -EINVAL; if (attrs[NL802154_KEY_ATTR_USAGE_CMDS]) { /* TODO for each nested */ nla_memcpy(commands, attrs[NL802154_KEY_ATTR_USAGE_CMDS], NL802154_CMD_FRAME_NR_IDS / 8); /* TODO understand the -EINVAL logic here? last condition */ if (commands[0] || commands[1] || commands[2] || commands[3] || commands[4] || commands[5] || commands[6] || commands[7] > BIT(NL802154_CMD_FRAME_MAX)) return -EINVAL; key.cmd_frame_ids = commands[7]; } else { key.cmd_frame_ids = 0; } nla_memcpy(key.key, attrs[NL802154_KEY_ATTR_BYTES], NL802154_KEY_SIZE); if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) return -ENOBUFS; return rdev_add_llsec_key(rdev, wpan_dev, &id, &key); } static int nl802154_del_llsec_key(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1]; struct ieee802154_llsec_key_id id; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_SEC_KEY] || nla_parse_nested_deprecated(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], nl802154_key_policy, info->extack)) return -EINVAL; if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) return -ENOBUFS; return rdev_del_llsec_key(rdev, wpan_dev, &id); } static int nl802154_send_device(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg802154_registered_device *rdev, struct net_device *dev, const struct ieee802154_llsec_device *dev_desc) { void *hdr; struct nlattr *nl_device; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; nl_device = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_DEVICE); if (!nl_device) goto nla_put_failure; if (nla_put_u32(msg, NL802154_DEV_ATTR_FRAME_COUNTER, dev_desc->frame_counter) || nla_put_le16(msg, NL802154_DEV_ATTR_PAN_ID, dev_desc->pan_id) || nla_put_le16(msg, NL802154_DEV_ATTR_SHORT_ADDR, dev_desc->short_addr) || nla_put_le64(msg, NL802154_DEV_ATTR_EXTENDED_ADDR, dev_desc->hwaddr, NL802154_DEV_ATTR_PAD) || nla_put_u8(msg, NL802154_DEV_ATTR_SECLEVEL_EXEMPT, dev_desc->seclevel_exempt) || nla_put_u32(msg, NL802154_DEV_ATTR_KEY_MODE, dev_desc->key_mode)) goto nla_put_failure; nla_nest_end(msg, nl_device); genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nl802154_dump_llsec_dev(struct sk_buff *skb, struct netlink_callback *cb) { struct cfg802154_registered_device *rdev = NULL; struct ieee802154_llsec_device *dev; struct ieee802154_llsec_table *table; struct wpan_dev *wpan_dev; int err; err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); if (err) return err; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { err = skb->len; goto out_err; } if (!wpan_dev->netdev) { err = -EINVAL; goto out_err; } rdev_lock_llsec_table(rdev, wpan_dev); rdev_get_llsec_table(rdev, wpan_dev, &table); /* TODO make it like station dump */ if (cb->args[2]) goto out; list_for_each_entry(dev, &table->devices, list) { if (nl802154_send_device(skb, NL802154_CMD_NEW_SEC_LEVEL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wpan_dev->netdev, dev) < 0) { /* TODO */ err = -EIO; rdev_unlock_llsec_table(rdev, wpan_dev); goto out_err; } } cb->args[2] = 1; out: rdev_unlock_llsec_table(rdev, wpan_dev); err = skb->len; out_err: nl802154_finish_wpan_dev_dump(rdev); return err; } static const struct nla_policy nl802154_dev_policy[NL802154_DEV_ATTR_MAX + 1] = { [NL802154_DEV_ATTR_FRAME_COUNTER] = { NLA_U32 }, [NL802154_DEV_ATTR_PAN_ID] = { .type = NLA_U16 }, [NL802154_DEV_ATTR_SHORT_ADDR] = { .type = NLA_U16 }, [NL802154_DEV_ATTR_EXTENDED_ADDR] = { .type = NLA_U64 }, [NL802154_DEV_ATTR_SECLEVEL_EXEMPT] = { NLA_U8 }, [NL802154_DEV_ATTR_KEY_MODE] = { NLA_U32 }, }; static int ieee802154_llsec_parse_device(struct nlattr *nla, struct ieee802154_llsec_device *dev) { struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; if (!nla || nla_parse_nested_deprecated(attrs, NL802154_DEV_ATTR_MAX, nla, nl802154_dev_policy, NULL)) return -EINVAL; memset(dev, 0, sizeof(*dev)); if (!attrs[NL802154_DEV_ATTR_FRAME_COUNTER] || !attrs[NL802154_DEV_ATTR_PAN_ID] || !attrs[NL802154_DEV_ATTR_SHORT_ADDR] || !attrs[NL802154_DEV_ATTR_EXTENDED_ADDR] || !attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT] || !attrs[NL802154_DEV_ATTR_KEY_MODE]) return -EINVAL; /* TODO be32 */ dev->frame_counter = nla_get_u32(attrs[NL802154_DEV_ATTR_FRAME_COUNTER]); dev->pan_id = nla_get_le16(attrs[NL802154_DEV_ATTR_PAN_ID]); dev->short_addr = nla_get_le16(attrs[NL802154_DEV_ATTR_SHORT_ADDR]); /* TODO rename hwaddr to extended_addr */ dev->hwaddr = nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]); dev->seclevel_exempt = nla_get_u8(attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT]); dev->key_mode = nla_get_u32(attrs[NL802154_DEV_ATTR_KEY_MODE]); if (dev->key_mode > NL802154_DEVKEY_MAX || (dev->seclevel_exempt != 0 && dev->seclevel_exempt != 1)) return -EINVAL; return 0; } static int nl802154_add_llsec_dev(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct ieee802154_llsec_device dev_desc; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (ieee802154_llsec_parse_device(info->attrs[NL802154_ATTR_SEC_DEVICE], &dev_desc) < 0) return -EINVAL; return rdev_add_device(rdev, wpan_dev, &dev_desc); } static int nl802154_del_llsec_dev(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; __le64 extended_addr; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_SEC_DEVICE] || nla_parse_nested_deprecated(attrs, NL802154_DEV_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVICE], nl802154_dev_policy, info->extack)) return -EINVAL; if (!attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]) return -EINVAL; extended_addr = nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]); return rdev_del_device(rdev, wpan_dev, extended_addr); } static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg802154_registered_device *rdev, struct net_device *dev, __le64 extended_addr, const struct ieee802154_llsec_device_key *devkey) { void *hdr; struct nlattr *nl_devkey, *nl_key_id; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; nl_devkey = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_DEVKEY); if (!nl_devkey) goto nla_put_failure; if (nla_put_le64(msg, NL802154_DEVKEY_ATTR_EXTENDED_ADDR, extended_addr, NL802154_DEVKEY_ATTR_PAD) || nla_put_u32(msg, NL802154_DEVKEY_ATTR_FRAME_COUNTER, devkey->frame_counter)) goto nla_put_failure; nl_key_id = nla_nest_start_noflag(msg, NL802154_DEVKEY_ATTR_ID); if (!nl_key_id) goto nla_put_failure; if (ieee802154_llsec_send_key_id(msg, &devkey->key_id) < 0) goto nla_put_failure; nla_nest_end(msg, nl_key_id); nla_nest_end(msg, nl_devkey); genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nl802154_dump_llsec_devkey(struct sk_buff *skb, struct netlink_callback *cb) { struct cfg802154_registered_device *rdev = NULL; struct ieee802154_llsec_device_key *kpos; struct ieee802154_llsec_device *dpos; struct ieee802154_llsec_table *table; struct wpan_dev *wpan_dev; int err; err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); if (err) return err; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { err = skb->len; goto out_err; } if (!wpan_dev->netdev) { err = -EINVAL; goto out_err; } rdev_lock_llsec_table(rdev, wpan_dev); rdev_get_llsec_table(rdev, wpan_dev, &table); /* TODO make it like station dump */ if (cb->args[2]) goto out; /* TODO look if remove devkey and do some nested attribute */ list_for_each_entry(dpos, &table->devices, list) { list_for_each_entry(kpos, &dpos->keys, list) { if (nl802154_send_devkey(skb, NL802154_CMD_NEW_SEC_LEVEL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wpan_dev->netdev, dpos->hwaddr, kpos) < 0) { /* TODO */ err = -EIO; rdev_unlock_llsec_table(rdev, wpan_dev); goto out_err; } } } cb->args[2] = 1; out: rdev_unlock_llsec_table(rdev, wpan_dev); err = skb->len; out_err: nl802154_finish_wpan_dev_dump(rdev); return err; } static const struct nla_policy nl802154_devkey_policy[NL802154_DEVKEY_ATTR_MAX + 1] = { [NL802154_DEVKEY_ATTR_FRAME_COUNTER] = { NLA_U32 }, [NL802154_DEVKEY_ATTR_EXTENDED_ADDR] = { NLA_U64 }, [NL802154_DEVKEY_ATTR_ID] = { NLA_NESTED }, }; static int nl802154_add_llsec_devkey(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1]; struct ieee802154_llsec_device_key key; __le64 extended_addr; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] || nla_parse_nested_deprecated(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], nl802154_devkey_policy, info->extack) < 0) return -EINVAL; if (!attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER] || !attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]) return -EINVAL; /* TODO change key.id ? */ if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID], &key.key_id) < 0) return -ENOBUFS; /* TODO be32 */ key.frame_counter = nla_get_u32(attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER]); /* TODO change naming hwaddr -> extended_addr * check unique identifier short+pan OR extended_addr */ extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]); return rdev_add_devkey(rdev, wpan_dev, extended_addr, &key); } static int nl802154_del_llsec_devkey(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1]; struct ieee802154_llsec_device_key key; __le64 extended_addr; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] || nla_parse_nested_deprecated(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], nl802154_devkey_policy, info->extack)) return -EINVAL; if (!attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]) return -EINVAL; /* TODO change key.id ? */ if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID], &key.key_id) < 0) return -ENOBUFS; /* TODO change naming hwaddr -> extended_addr * check unique identifier short+pan OR extended_addr */ extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]); return rdev_del_devkey(rdev, wpan_dev, extended_addr, &key); } static int nl802154_send_seclevel(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg802154_registered_device *rdev, struct net_device *dev, const struct ieee802154_llsec_seclevel *sl) { void *hdr; struct nlattr *nl_seclevel; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; nl_seclevel = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_LEVEL); if (!nl_seclevel) goto nla_put_failure; if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_FRAME, sl->frame_type) || nla_put_u32(msg, NL802154_SECLEVEL_ATTR_LEVELS, sl->sec_levels) || nla_put_u8(msg, NL802154_SECLEVEL_ATTR_DEV_OVERRIDE, sl->device_override)) goto nla_put_failure; if (sl->frame_type == NL802154_FRAME_CMD) { if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_CMD_FRAME, sl->cmd_frame_id)) goto nla_put_failure; } nla_nest_end(msg, nl_seclevel); genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nl802154_dump_llsec_seclevel(struct sk_buff *skb, struct netlink_callback *cb) { struct cfg802154_registered_device *rdev = NULL; struct ieee802154_llsec_seclevel *sl; struct ieee802154_llsec_table *table; struct wpan_dev *wpan_dev; int err; err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); if (err) return err; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { err = skb->len; goto out_err; } if (!wpan_dev->netdev) { err = -EINVAL; goto out_err; } rdev_lock_llsec_table(rdev, wpan_dev); rdev_get_llsec_table(rdev, wpan_dev, &table); /* TODO make it like station dump */ if (cb->args[2]) goto out; list_for_each_entry(sl, &table->security_levels, list) { if (nl802154_send_seclevel(skb, NL802154_CMD_NEW_SEC_LEVEL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wpan_dev->netdev, sl) < 0) { /* TODO */ err = -EIO; rdev_unlock_llsec_table(rdev, wpan_dev); goto out_err; } } cb->args[2] = 1; out: rdev_unlock_llsec_table(rdev, wpan_dev); err = skb->len; out_err: nl802154_finish_wpan_dev_dump(rdev); return err; } static const struct nla_policy nl802154_seclevel_policy[NL802154_SECLEVEL_ATTR_MAX + 1] = { [NL802154_SECLEVEL_ATTR_LEVELS] = { .type = NLA_U8 }, [NL802154_SECLEVEL_ATTR_FRAME] = { .type = NLA_U32 }, [NL802154_SECLEVEL_ATTR_CMD_FRAME] = { .type = NLA_U32 }, [NL802154_SECLEVEL_ATTR_DEV_OVERRIDE] = { .type = NLA_U8 }, }; static int llsec_parse_seclevel(struct nlattr *nla, struct ieee802154_llsec_seclevel *sl) { struct nlattr *attrs[NL802154_SECLEVEL_ATTR_MAX + 1]; if (!nla || nla_parse_nested_deprecated(attrs, NL802154_SECLEVEL_ATTR_MAX, nla, nl802154_seclevel_policy, NULL)) return -EINVAL; memset(sl, 0, sizeof(*sl)); if (!attrs[NL802154_SECLEVEL_ATTR_LEVELS] || !attrs[NL802154_SECLEVEL_ATTR_FRAME] || !attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE]) return -EINVAL; sl->sec_levels = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_LEVELS]); sl->frame_type = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_FRAME]); sl->device_override = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE]); if (sl->frame_type > NL802154_FRAME_MAX || (sl->device_override != 0 && sl->device_override != 1)) return -EINVAL; if (sl->frame_type == NL802154_FRAME_CMD) { if (!attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME]) return -EINVAL; sl->cmd_frame_id = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME]); if (sl->cmd_frame_id > NL802154_CMD_FRAME_MAX) return -EINVAL; } return 0; } static int nl802154_add_llsec_seclevel(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct ieee802154_llsec_seclevel sl; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL], &sl) < 0) return -EINVAL; return rdev_add_seclevel(rdev, wpan_dev, &sl); } static int nl802154_del_llsec_seclevel(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct ieee802154_llsec_seclevel sl; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL], &sl) < 0) return -EINVAL; return rdev_del_seclevel(rdev, wpan_dev, &sl); } #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ #define NL802154_FLAG_NEED_WPAN_PHY 0x01 #define NL802154_FLAG_NEED_NETDEV 0x02 #define NL802154_FLAG_NEED_RTNL 0x04 #define NL802154_FLAG_CHECK_NETDEV_UP 0x08 #define NL802154_FLAG_NEED_WPAN_DEV 0x10 static int nl802154_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev; struct wpan_dev *wpan_dev; struct net_device *dev; bool rtnl = ops->internal_flags & NL802154_FLAG_NEED_RTNL; if (rtnl) rtnl_lock(); if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_PHY) { rdev = cfg802154_get_dev_from_info(genl_info_net(info), info); if (IS_ERR(rdev)) { if (rtnl) rtnl_unlock(); return PTR_ERR(rdev); } info->user_ptr[0] = rdev; } else if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV || ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { ASSERT_RTNL(); wpan_dev = __cfg802154_wpan_dev_from_attrs(genl_info_net(info), info->attrs); if (IS_ERR(wpan_dev)) { if (rtnl) rtnl_unlock(); return PTR_ERR(wpan_dev); } dev = wpan_dev->netdev; rdev = wpan_phy_to_rdev(wpan_dev->wpan_phy); if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV) { if (!dev) { if (rtnl) rtnl_unlock(); return -EINVAL; } info->user_ptr[1] = dev; } else { info->user_ptr[1] = wpan_dev; } if (dev) { if (ops->internal_flags & NL802154_FLAG_CHECK_NETDEV_UP && !netif_running(dev)) { if (rtnl) rtnl_unlock(); return -ENETDOWN; } dev_hold(dev); } info->user_ptr[0] = rdev; } return 0; } static void nl802154_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { if (info->user_ptr[1]) { if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { struct wpan_dev *wpan_dev = info->user_ptr[1]; dev_put(wpan_dev->netdev); } else { dev_put(info->user_ptr[1]); } } if (ops->internal_flags & NL802154_FLAG_NEED_RTNL) rtnl_unlock(); } static const struct genl_ops nl802154_ops[] = { { .cmd = NL802154_CMD_GET_WPAN_PHY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, .doit = nl802154_get_wpan_phy, .dumpit = nl802154_dump_wpan_phy, .done = nl802154_dump_wpan_phy_done, /* can be retrieved by unprivileged users */ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_GET_INTERFACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_get_interface, .dumpit = nl802154_dump_interface, /* can be retrieved by unprivileged users */ .internal_flags = NL802154_FLAG_NEED_WPAN_DEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_INTERFACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_new_interface, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_INTERFACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_del_interface, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_DEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_CHANNEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_channel, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_CCA_MODE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_cca_mode, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_CCA_ED_LEVEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_cca_ed_level, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_TX_POWER, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_tx_power, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_WPAN_PHY_NETNS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_wpan_phy_netns, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_PAN_ID, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_pan_id, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_SHORT_ADDR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_short_addr, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_BACKOFF_EXPONENT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_backoff_exponent, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_MAX_CSMA_BACKOFFS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_max_csma_backoffs, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_MAX_FRAME_RETRIES, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_max_frame_retries, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_LBT_MODE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_lbt_mode, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_ACKREQ_DEFAULT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_ackreq_default, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_TRIGGER_SCAN, .doit = nl802154_trigger_scan, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_CHECK_NETDEV_UP | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_ABORT_SCAN, .doit = nl802154_abort_scan, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_CHECK_NETDEV_UP | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SEND_BEACONS, .doit = nl802154_send_beacons, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_CHECK_NETDEV_UP | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_STOP_BEACONS, .doit = nl802154_stop_beacons, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_CHECK_NETDEV_UP | NL802154_FLAG_NEED_RTNL, }, #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL { .cmd = NL802154_CMD_SET_SEC_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_llsec_params, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_GET_SEC_KEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching key id? */ .dumpit = nl802154_dump_llsec_key, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_SEC_KEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_add_llsec_key, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_SEC_KEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_del_llsec_key, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, /* TODO unique identifier must short+pan OR extended_addr */ { .cmd = NL802154_CMD_GET_SEC_DEV, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching extended_addr? */ .dumpit = nl802154_dump_llsec_dev, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_SEC_DEV, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_add_llsec_dev, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_SEC_DEV, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_del_llsec_dev, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, /* TODO remove complete devkey, put it as nested? */ { .cmd = NL802154_CMD_GET_SEC_DEVKEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO doit by matching ??? */ .dumpit = nl802154_dump_llsec_devkey, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_SEC_DEVKEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_add_llsec_devkey, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_SEC_DEVKEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_del_llsec_devkey, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_GET_SEC_LEVEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching frame_type? */ .dumpit = nl802154_dump_llsec_seclevel, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_SEC_LEVEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_add_llsec_seclevel, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_SEC_LEVEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* TODO match frame_type only? */ .doit = nl802154_del_llsec_seclevel, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; static struct genl_family nl802154_fam __ro_after_init = { .name = NL802154_GENL_NAME, /* have users key off the name instead */ .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ .maxattr = NL802154_ATTR_MAX, .policy = nl802154_policy, .netnsok = true, .pre_doit = nl802154_pre_doit, .post_doit = nl802154_post_doit, .module = THIS_MODULE, .ops = nl802154_ops, .n_ops = ARRAY_SIZE(nl802154_ops), .resv_start_op = NL802154_CMD_DEL_SEC_LEVEL + 1, .mcgrps = nl802154_mcgrps, .n_mcgrps = ARRAY_SIZE(nl802154_mcgrps), }; /* initialisation/exit functions */ int __init nl802154_init(void) { return genl_register_family(&nl802154_fam); } void nl802154_exit(void) { genl_unregister_family(&nl802154_fam); } |
3234 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2014 Felix Fietkau <nbd@nbd.name> * Copyright (C) 2004 - 2009 Ivo van Doorn <IvDoorn@gmail.com> */ #ifndef _LINUX_BITFIELD_H #define _LINUX_BITFIELD_H #include <linux/build_bug.h> #include <asm/byteorder.h> /* * Bitfield access macros * * FIELD_{GET,PREP} macros take as first parameter shifted mask * from which they extract the base mask and shift amount. * Mask must be a compilation time constant. * * Example: * * #include <linux/bitfield.h> * #include <linux/bits.h> * * #define REG_FIELD_A GENMASK(6, 0) * #define REG_FIELD_B BIT(7) * #define REG_FIELD_C GENMASK(15, 8) * #define REG_FIELD_D GENMASK(31, 16) * * Get: * a = FIELD_GET(REG_FIELD_A, reg); * b = FIELD_GET(REG_FIELD_B, reg); * * Set: * reg = FIELD_PREP(REG_FIELD_A, 1) | * FIELD_PREP(REG_FIELD_B, 0) | * FIELD_PREP(REG_FIELD_C, c) | * FIELD_PREP(REG_FIELD_D, 0x40); * * Modify: * reg &= ~REG_FIELD_C; * reg |= FIELD_PREP(REG_FIELD_C, c); */ #define __bf_shf(x) (__builtin_ffsll(x) - 1) #define __scalar_type_to_unsigned_cases(type) \ unsigned type: (unsigned type)0, \ signed type: (unsigned type)0 #define __unsigned_scalar_typeof(x) typeof( \ _Generic((x), \ char: (unsigned char)0, \ __scalar_type_to_unsigned_cases(char), \ __scalar_type_to_unsigned_cases(short), \ __scalar_type_to_unsigned_cases(int), \ __scalar_type_to_unsigned_cases(long), \ __scalar_type_to_unsigned_cases(long long), \ default: (x))) #define __bf_cast_unsigned(type, x) ((__unsigned_scalar_typeof(type))(x)) #define __BF_FIELD_CHECK(_mask, _reg, _val, _pfx) \ ({ \ BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \ _pfx "mask is not constant"); \ BUILD_BUG_ON_MSG((_mask) == 0, _pfx "mask is zero"); \ BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \ _pfx "value too large for the field"); \ BUILD_BUG_ON_MSG(__bf_cast_unsigned(_mask, _mask) > \ __bf_cast_unsigned(_reg, ~0ull), \ _pfx "type of reg too small for mask"); \ __BUILD_BUG_ON_NOT_POWER_OF_2((_mask) + \ (1ULL << __bf_shf(_mask))); \ }) /** * FIELD_MAX() - produce the maximum value representable by a field * @_mask: shifted mask defining the field's length and position * * FIELD_MAX() returns the maximum value that can be held in the field * specified by @_mask. */ #define FIELD_MAX(_mask) \ ({ \ __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_MAX: "); \ (typeof(_mask))((_mask) >> __bf_shf(_mask)); \ }) /** * FIELD_FIT() - check if value fits in the field * @_mask: shifted mask defining the field's length and position * @_val: value to test against the field * * Return: true if @_val can fit inside @_mask, false if @_val is too big. */ #define FIELD_FIT(_mask, _val) \ ({ \ __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_FIT: "); \ !((((typeof(_mask))_val) << __bf_shf(_mask)) & ~(_mask)); \ }) /** * FIELD_PREP() - prepare a bitfield element * @_mask: shifted mask defining the field's length and position * @_val: value to put in the field * * FIELD_PREP() masks and shifts up the value. The result should * be combined with other fields of the bitfield using logical OR. */ #define FIELD_PREP(_mask, _val) \ ({ \ __BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: "); \ ((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask); \ }) #define __BF_CHECK_POW2(n) BUILD_BUG_ON_ZERO(((n) & ((n) - 1)) != 0) /** * FIELD_PREP_CONST() - prepare a constant bitfield element * @_mask: shifted mask defining the field's length and position * @_val: value to put in the field * * FIELD_PREP_CONST() masks and shifts up the value. The result should * be combined with other fields of the bitfield using logical OR. * * Unlike FIELD_PREP() this is a constant expression and can therefore * be used in initializers. Error checking is less comfortable for this * version, and non-constant masks cannot be used. */ #define FIELD_PREP_CONST(_mask, _val) \ ( \ /* mask must be non-zero */ \ BUILD_BUG_ON_ZERO((_mask) == 0) + \ /* check if value fits */ \ BUILD_BUG_ON_ZERO(~((_mask) >> __bf_shf(_mask)) & (_val)) + \ /* check if mask is contiguous */ \ __BF_CHECK_POW2((_mask) + (1ULL << __bf_shf(_mask))) + \ /* and create the value */ \ (((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask)) \ ) /** * FIELD_GET() - extract a bitfield element * @_mask: shifted mask defining the field's length and position * @_reg: value of entire bitfield * * FIELD_GET() extracts the field specified by @_mask from the * bitfield passed in as @_reg by masking and shifting it down. */ #define FIELD_GET(_mask, _reg) \ ({ \ __BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: "); \ (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ }) extern void __compiletime_error("value doesn't fit into mask") __field_overflow(void); extern void __compiletime_error("bad bitfield mask") __bad_mask(void); static __always_inline u64 field_multiplier(u64 field) { if ((field | (field - 1)) & ((field | (field - 1)) + 1)) __bad_mask(); return field & -field; } static __always_inline u64 field_mask(u64 field) { return field / field_multiplier(field); } #define field_max(field) ((typeof(field))field_mask(field)) #define ____MAKE_OP(type,base,to,from) \ static __always_inline __##type type##_encode_bits(base v, base field) \ { \ if (__builtin_constant_p(v) && (v & ~field_mask(field))) \ __field_overflow(); \ return to((v & field_mask(field)) * field_multiplier(field)); \ } \ static __always_inline __##type type##_replace_bits(__##type old, \ base val, base field) \ { \ return (old & ~to(field)) | type##_encode_bits(val, field); \ } \ static __always_inline void type##p_replace_bits(__##type *p, \ base val, base field) \ { \ *p = (*p & ~to(field)) | type##_encode_bits(val, field); \ } \ static __always_inline base type##_get_bits(__##type v, base field) \ { \ return (from(v) & field)/field_multiplier(field); \ } #define __MAKE_OP(size) \ ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \ ____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu) \ ____MAKE_OP(u##size,u##size,,) ____MAKE_OP(u8,u8,,) __MAKE_OP(16) __MAKE_OP(32) __MAKE_OP(64) #undef __MAKE_OP #undef ____MAKE_OP #endif |
38 38 38 51 85 57 28 28 57 53 14 14 14 53 57 57 57 57 53 53 255 255 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | // SPDX-License-Identifier: GPL-2.0 #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/fs.h> #include <net/af_unix.h> #include <net/scm.h> #include <linux/init.h> #include <linux/io_uring.h> #include "scm.h" unsigned int unix_tot_inflight; EXPORT_SYMBOL(unix_tot_inflight); LIST_HEAD(gc_inflight_list); EXPORT_SYMBOL(gc_inflight_list); DEFINE_SPINLOCK(unix_gc_lock); EXPORT_SYMBOL(unix_gc_lock); struct sock *unix_get_socket(struct file *filp) { struct sock *u_sock = NULL; struct inode *inode = file_inode(filp); /* Socket ? */ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { struct socket *sock = SOCKET_I(inode); const struct proto_ops *ops = READ_ONCE(sock->ops); struct sock *s = sock->sk; /* PF_UNIX ? */ if (s && ops && ops->family == PF_UNIX) u_sock = s; } else { /* Could be an io_uring instance */ u_sock = io_uring_get_socket(filp); } return u_sock; } EXPORT_SYMBOL(unix_get_socket); /* Keep the number of times in flight count for the file * descriptor if it is for an AF_UNIX socket. */ void unix_inflight(struct user_struct *user, struct file *fp) { struct sock *s = unix_get_socket(fp); spin_lock(&unix_gc_lock); if (s) { struct unix_sock *u = unix_sk(s); if (atomic_long_inc_return(&u->inflight) == 1) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); } else { BUG_ON(list_empty(&u->link)); } /* Paired with READ_ONCE() in wait_for_unix_gc() */ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); } WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); spin_unlock(&unix_gc_lock); } void unix_notinflight(struct user_struct *user, struct file *fp) { struct sock *s = unix_get_socket(fp); spin_lock(&unix_gc_lock); if (s) { struct unix_sock *u = unix_sk(s); BUG_ON(!atomic_long_read(&u->inflight)); BUG_ON(list_empty(&u->link)); if (atomic_long_dec_and_test(&u->inflight)) list_del_init(&u->link); /* Paired with READ_ONCE() in wait_for_unix_gc() */ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); } WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); spin_unlock(&unix_gc_lock); } /* * The "user->unix_inflight" variable is protected by the garbage * collection lock, and we just read it locklessly here. If you go * over the limit, there might be a tiny race in actually noticing * it across threads. Tough. */ static inline bool too_many_unix_fds(struct task_struct *p) { struct user_struct *user = current_user(); if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); return false; } int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; if (too_many_unix_fds(current)) return -ETOOMANYREFS; /* * Need to duplicate file references for the sake of garbage * collection. Otherwise a socket in the fps might become a * candidate for GC while the skb is not yet queued. */ UNIXCB(skb).fp = scm_fp_dup(scm->fp); if (!UNIXCB(skb).fp) return -ENOMEM; for (i = scm->fp->count - 1; i >= 0; i--) unix_inflight(scm->fp->user, scm->fp->fp[i]); return 0; } EXPORT_SYMBOL(unix_attach_fds); void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; scm->fp = UNIXCB(skb).fp; UNIXCB(skb).fp = NULL; for (i = scm->fp->count-1; i >= 0; i--) unix_notinflight(scm->fp->user, scm->fp->fp[i]); } EXPORT_SYMBOL(unix_detach_fds); void unix_destruct_scm(struct sk_buff *skb) { struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); scm.pid = UNIXCB(skb).pid; if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); /* Alas, it calls VFS */ /* So fscking what? fput() had been SMP-safe since the last Summer */ scm_destroy(&scm); sock_wfree(skb); } EXPORT_SYMBOL(unix_destruct_scm); void io_uring_destruct_scm(struct sk_buff *skb) { unix_destruct_scm(skb); } EXPORT_SYMBOL(io_uring_destruct_scm); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_KSM_H #define __LINUX_KSM_H /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork(). */ #include <linux/bitops.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/sched.h> #include <linux/sched/coredump.h> #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); void ksm_add_vma(struct vm_area_struct *vma); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); int ksm_disable(struct mm_struct *mm); int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); /* * To identify zeropages that were mapped by KSM, we reuse the dirty bit * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when * deduplicating memory. */ #define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) extern unsigned long ksm_zero_pages; static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { if (is_ksm_zero_pte(pte)) { ksm_zero_pages--; mm->ksm_zero_pages--; } } static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { int ret; if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) { ret = __ksm_enter(mm); if (ret) return ret; } if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags)) set_bit(MMF_VM_MERGE_ANY, &mm->flags); return 0; } static inline void ksm_exit(struct mm_struct *mm) { if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) __ksm_exit(mm); } /* * When do_swap_page() first faults in from swap what used to be a KSM page, * no problem, it will be assigned to this vma's anon_vma; but thereafter, * it might be faulted into a different anon_vma (or perhaps to a different * offset in the same anon_vma). do_swap_page() cannot do all the locking * needed to reconstitute a cross-anon_vma KSM page: for now it has to make * a copy, and leave remerging the pages to a later pass of ksmd. * * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? */ struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); #ifdef CONFIG_MEMORY_FAILURE void collect_procs_ksm(struct page *page, struct list_head *to_kill, int force_early); #endif #ifdef CONFIG_PROC_FS long ksm_process_profit(struct mm_struct *); #endif /* CONFIG_PROC_FS */ #else /* !CONFIG_KSM */ static inline void ksm_add_vma(struct vm_area_struct *vma) { } static inline int ksm_disable(struct mm_struct *mm) { return 0; } static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { return 0; } static inline void ksm_exit(struct mm_struct *mm) { } static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } #ifdef CONFIG_MEMORY_FAILURE static inline void collect_procs_ksm(struct page *page, struct list_head *to_kill, int force_early) { } #endif #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { return 0; } static inline struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address) { return page; } static inline void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { } static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old) { } #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ #endif /* __LINUX_KSM_H */ |
9252 9253 5803 5803 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | // SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/bug.h> #include <linux/atomic.h> #include <linux/errseq.h> #include <linux/log2.h> /* * An errseq_t is a way of recording errors in one place, and allowing any * number of "subscribers" to tell whether it has changed since a previous * point where it was sampled. * * It's implemented as an unsigned 32-bit value. The low order bits are * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits * are used as a counter. This is done with atomics instead of locking so that * these functions can be called from any context. * * The general idea is for consumers to sample an errseq_t value. That value * can later be used to tell whether any new errors have occurred since that * sampling was done. * * Note that there is a risk of collisions if new errors are being recorded * frequently, since we have so few bits to use as a counter. * * To mitigate this, one bit is used as a flag to tell whether the value has * been sampled since a new value was recorded. That allows us to avoid bumping * the counter if no one has sampled it since the last time an error was * recorded. * * A new errseq_t should always be zeroed out. A errseq_t value of all zeroes * is the special (but common) case where there has never been an error. An all * zero value thus serves as the "epoch" if one wishes to know whether there * has ever been an error set since it was first initialized. */ /* The low bits are designated for error code (max of MAX_ERRNO) */ #define ERRSEQ_SHIFT ilog2(MAX_ERRNO + 1) /* This bit is used as a flag to indicate whether the value has been seen */ #define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT) /* The lowest bit of the counter */ #define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) /** * errseq_set - set a errseq_t for later reporting * @eseq: errseq_t field that should be set * @err: error to set (must be between -1 and -MAX_ERRNO) * * This function sets the error in @eseq, and increments the sequence counter * if the last sequence was sampled at some point in the past. * * Any error set will always overwrite an existing error. * * Return: The previous value, primarily for debugging purposes. The * return value should not be used as a previously sampled value in later * calls as it will not have the SEEN flag set. */ errseq_t errseq_set(errseq_t *eseq, int err) { errseq_t cur, old; /* MAX_ERRNO must be able to serve as a mask */ BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1); /* * Ensure the error code actually fits where we want it to go. If it * doesn't then just throw a warning and don't record anything. We * also don't accept zero here as that would effectively clear a * previous error. */ old = READ_ONCE(*eseq); if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO), "err = %d\n", err)) return old; for (;;) { errseq_t new; /* Clear out error bits and set new error */ new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err; /* Only increment if someone has looked at it */ if (old & ERRSEQ_SEEN) new += ERRSEQ_CTR_INC; /* If there would be no change, then call it done */ if (new == old) { cur = new; break; } /* Try to swap the new value into place */ cur = cmpxchg(eseq, old, new); /* * Call it success if we did the swap or someone else beat us * to it for the same value. */ if (likely(cur == old || cur == new)) break; /* Raced with an update, try again */ old = cur; } return cur; } EXPORT_SYMBOL(errseq_set); /** * errseq_sample() - Grab current errseq_t value. * @eseq: Pointer to errseq_t to be sampled. * * This function allows callers to initialise their errseq_t variable. * If the error has been "seen", new callers will not see an old error. * If there is an unseen error in @eseq, the caller of this function will * see it the next time it checks for an error. * * Context: Any context. * Return: The current errseq value. */ errseq_t errseq_sample(errseq_t *eseq) { errseq_t old = READ_ONCE(*eseq); /* If nobody has seen this error yet, then we can be the first. */ if (!(old & ERRSEQ_SEEN)) old = 0; return old; } EXPORT_SYMBOL(errseq_sample); /** * errseq_check() - Has an error occurred since a particular sample point? * @eseq: Pointer to errseq_t value to be checked. * @since: Previously-sampled errseq_t from which to check. * * Grab the value that eseq points to, and see if it has changed @since * the given value was sampled. The @since value is not advanced, so there * is no need to mark the value as seen. * * Return: The latest error set in the errseq_t or 0 if it hasn't changed. */ int errseq_check(errseq_t *eseq, errseq_t since) { errseq_t cur = READ_ONCE(*eseq); if (likely(cur == since)) return 0; return -(cur & MAX_ERRNO); } EXPORT_SYMBOL(errseq_check); /** * errseq_check_and_advance() - Check an errseq_t and advance to current value. * @eseq: Pointer to value being checked and reported. * @since: Pointer to previously-sampled errseq_t to check against and advance. * * Grab the eseq value, and see whether it matches the value that @since * points to. If it does, then just return 0. * * If it doesn't, then the value has changed. Set the "seen" flag, and try to * swap it into place as the new eseq value. Then, set that value as the new * "since" value, and return whatever the error portion is set to. * * Note that no locking is provided here for concurrent updates to the "since" * value. The caller must provide that if necessary. Because of this, callers * may want to do a lockless errseq_check before taking the lock and calling * this. * * Return: Negative errno if one has been stored, or 0 if no new error has * occurred. */ int errseq_check_and_advance(errseq_t *eseq, errseq_t *since) { int err = 0; errseq_t old, new; /* * Most callers will want to use the inline wrapper to check this, * so that the common case of no error is handled without needing * to take the lock that protects the "since" value. */ old = READ_ONCE(*eseq); if (old != *since) { /* * Set the flag and try to swap it into place if it has * changed. * * We don't care about the outcome of the swap here. If the * swap doesn't occur, then it has either been updated by a * writer who is altering the value in some way (updating * counter or resetting the error), or another reader who is * just setting the "seen" flag. Either outcome is OK, and we * can advance "since" and return an error based on what we * have. */ new = old | ERRSEQ_SEEN; if (new != old) cmpxchg(eseq, old, new); *since = new; err = -(new & MAX_ERRNO); } return err; } EXPORT_SYMBOL(errseq_check_and_advance); |
482 486 354 43 36 147 169 154 36 33 43 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 | /* * net/tipc/msg.h: Include file for TIPC message header routines * * Copyright (c) 2000-2007, 2014-2017 Ericsson AB * Copyright (c) 2005-2008, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_MSG_H #define _TIPC_MSG_H #include <linux/tipc.h> #include "core.h" /* * Constants and routines used to read and write TIPC payload message headers * * Note: Some items are also used with TIPC internal message headers */ #define TIPC_VERSION 2 struct plist; /* * Payload message users are defined in TIPC's public API: * - TIPC_LOW_IMPORTANCE * - TIPC_MEDIUM_IMPORTANCE * - TIPC_HIGH_IMPORTANCE * - TIPC_CRITICAL_IMPORTANCE */ #define TIPC_SYSTEM_IMPORTANCE 4 /* * Payload message types */ #define TIPC_CONN_MSG 0 #define TIPC_MCAST_MSG 1 #define TIPC_NAMED_MSG 2 #define TIPC_DIRECT_MSG 3 #define TIPC_GRP_MEMBER_EVT 4 #define TIPC_GRP_BCAST_MSG 5 #define TIPC_GRP_MCAST_MSG 6 #define TIPC_GRP_UCAST_MSG 7 /* * Internal message users */ #define BCAST_PROTOCOL 5 #define MSG_BUNDLER 6 #define LINK_PROTOCOL 7 #define CONN_MANAGER 8 #define GROUP_PROTOCOL 9 #define TUNNEL_PROTOCOL 10 #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 #define MSG_CRYPTO 14 #define SOCK_WAKEUP 14 /* pseudo user */ #define TOP_SRV 15 /* pseudo user */ /* * Message header sizes */ #define SHORT_H_SIZE 24 /* In-cluster basic payload message */ #define BASIC_H_SIZE 32 /* Basic payload message */ #define NAMED_H_SIZE 40 /* Named payload message */ #define MCAST_H_SIZE 44 /* Multicast payload message */ #define GROUP_H_SIZE 44 /* Group payload message */ #define INT_H_SIZE 40 /* Internal messages */ #define MIN_H_SIZE 24 /* Smallest legal TIPC header size */ #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) #define TIPC_MEDIA_INFO_OFFSET 5 extern const int one_page_mtu; struct tipc_skb_cb { union { struct { struct sk_buff *tail; unsigned long nxt_retr; unsigned long retr_stamp; u32 bytes_read; u32 orig_member; u16 chain_imp; u16 ackers; u16 retr_cnt; } __packed; #ifdef CONFIG_TIPC_CRYPTO struct { struct tipc_crypto *rx; struct tipc_aead *last; u8 recurs; } tx_clone_ctx __packed; #endif } __packed; union { struct { u8 validated:1; #ifdef CONFIG_TIPC_CRYPTO u8 encrypted:1; u8 decrypted:1; #define SKB_PROBING 1 #define SKB_GRACING 2 u8 xmit_type:2; u8 tx_clone_deferred:1; #endif }; u8 flags; }; u8 reserved; #ifdef CONFIG_TIPC_CRYPTO void *crypto_ctx; #endif } __packed; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) struct tipc_msg { __be32 hdr[15]; }; /* struct tipc_gap_ack - TIPC Gap ACK block * @ack: seqno of the last consecutive packet in link deferdq * @gap: number of gap packets since the last ack * * E.g: * link deferdq: 1 2 3 4 10 11 13 14 15 20 * --> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0> */ struct tipc_gap_ack { __be16 ack; __be16 gap; }; /* struct tipc_gap_ack_blks * @len: actual length of the record * @ugack_cnt: number of Gap ACK blocks for unicast (following the broadcast * ones) * @start_index: starting index for "valid" broadcast Gap ACK blocks * @bgack_cnt: number of Gap ACK blocks for broadcast in the record * @gacks: array of Gap ACK blocks * * 31 16 15 0 * +-------------+-------------+-------------+-------------+ * | bgack_cnt | ugack_cnt | len | * +-------------+-------------+-------------+-------------+ - * | gap | ack | | * +-------------+-------------+-------------+-------------+ > bc gacks * : : : | * +-------------+-------------+-------------+-------------+ - * | gap | ack | | * +-------------+-------------+-------------+-------------+ > uc gacks * : : : | * +-------------+-------------+-------------+-------------+ - */ struct tipc_gap_ack_blks { __be16 len; union { u8 ugack_cnt; u8 start_index; }; u8 bgack_cnt; struct tipc_gap_ack gacks[]; }; #define MAX_GAP_ACK_BLKS 128 #define MAX_GAP_ACK_BLKS_SZ (sizeof(struct tipc_gap_ack_blks) + \ sizeof(struct tipc_gap_ack) * MAX_GAP_ACK_BLKS) static inline struct tipc_msg *buf_msg(struct sk_buff *skb) { return (struct tipc_msg *)skb->data; } static inline u32 msg_word(struct tipc_msg *m, u32 pos) { return ntohl(m->hdr[pos]); } static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val) { m->hdr[w] = htonl(val); } static inline u32 msg_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask) { return (msg_word(m, w) >> pos) & mask; } static inline void msg_set_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask, u32 val) { val = (val & mask) << pos; mask = mask << pos; m->hdr[w] &= ~htonl(mask); m->hdr[w] |= htonl(val); } /* * Word 0 */ static inline u32 msg_version(struct tipc_msg *m) { return msg_bits(m, 0, 29, 7); } static inline void msg_set_version(struct tipc_msg *m) { msg_set_bits(m, 0, 29, 7, TIPC_VERSION); } static inline u32 msg_user(struct tipc_msg *m) { return msg_bits(m, 0, 25, 0xf); } static inline u32 msg_isdata(struct tipc_msg *m) { return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE; } static inline void msg_set_user(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 25, 0xf, n); } static inline u32 msg_hdr_sz(struct tipc_msg *m) { return msg_bits(m, 0, 21, 0xf) << 2; } static inline void msg_set_hdr_sz(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 21, 0xf, n>>2); } static inline u32 msg_size(struct tipc_msg *m) { return msg_bits(m, 0, 0, 0x1ffff); } static inline u32 msg_blocks(struct tipc_msg *m) { return (msg_size(m) / 1024) + 1; } static inline u32 msg_data_sz(struct tipc_msg *m) { return msg_size(m) - msg_hdr_sz(m); } static inline int msg_non_seq(struct tipc_msg *m) { return msg_bits(m, 0, 20, 1); } static inline void msg_set_non_seq(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 20, 1, n); } static inline int msg_is_syn(struct tipc_msg *m) { return msg_bits(m, 0, 17, 1); } static inline void msg_set_syn(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 17, 1, d); } static inline int msg_dest_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_is_keepalive(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_is_keepalive(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_src_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 18, 1, d); } static inline int msg_ack_required(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_ack_required(struct tipc_msg *m) { msg_set_bits(m, 0, 18, 1, 1); } static inline int msg_nagle_ack(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_nagle_ack(struct tipc_msg *m) { msg_set_bits(m, 0, 18, 1, 1); } static inline bool msg_is_rcast(struct tipc_msg *m) { return msg_bits(m, 0, 18, 0x1); } static inline void msg_set_is_rcast(struct tipc_msg *m, bool d) { msg_set_bits(m, 0, 18, 0x1, d); } static inline void msg_set_size(struct tipc_msg *m, u32 sz) { m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz); } static inline unchar *msg_data(struct tipc_msg *m) { return ((unchar *)m) + msg_hdr_sz(m); } static inline struct tipc_msg *msg_inner_hdr(struct tipc_msg *m) { return (struct tipc_msg *)msg_data(m); } /* * Word 1 */ static inline u32 msg_type(struct tipc_msg *m) { return msg_bits(m, 1, 29, 0x7); } static inline void msg_set_type(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 29, 0x7, n); } static inline int msg_in_group(struct tipc_msg *m) { int mtyp = msg_type(m); return mtyp >= TIPC_GRP_MEMBER_EVT && mtyp <= TIPC_GRP_UCAST_MSG; } static inline bool msg_is_grp_evt(struct tipc_msg *m) { return msg_type(m) == TIPC_GRP_MEMBER_EVT; } static inline u32 msg_named(struct tipc_msg *m) { return msg_type(m) == TIPC_NAMED_MSG; } static inline u32 msg_mcast(struct tipc_msg *m) { int mtyp = msg_type(m); return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG) || (mtyp == TIPC_GRP_MCAST_MSG)); } static inline u32 msg_connected(struct tipc_msg *m) { return msg_type(m) == TIPC_CONN_MSG; } static inline u32 msg_direct(struct tipc_msg *m) { return msg_type(m) == TIPC_DIRECT_MSG; } static inline u32 msg_errcode(struct tipc_msg *m) { return msg_bits(m, 1, 25, 0xf); } static inline void msg_set_errcode(struct tipc_msg *m, u32 err) { msg_set_bits(m, 1, 25, 0xf, err); } static inline void msg_set_bulk(struct tipc_msg *m) { msg_set_bits(m, 1, 28, 0x1, 1); } static inline u32 msg_is_bulk(struct tipc_msg *m) { return msg_bits(m, 1, 28, 0x1); } static inline void msg_set_last_bulk(struct tipc_msg *m) { msg_set_bits(m, 1, 27, 0x1, 1); } static inline u32 msg_is_last_bulk(struct tipc_msg *m) { return msg_bits(m, 1, 27, 0x1); } static inline void msg_set_non_legacy(struct tipc_msg *m) { msg_set_bits(m, 1, 26, 0x1, 1); } static inline u32 msg_is_legacy(struct tipc_msg *m) { return !msg_bits(m, 1, 26, 0x1); } static inline u32 msg_reroute_cnt(struct tipc_msg *m) { return msg_bits(m, 1, 21, 0xf); } static inline void msg_incr_reroute_cnt(struct tipc_msg *m) { msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1); } static inline u32 msg_lookup_scope(struct tipc_msg *m) { return msg_bits(m, 1, 19, 0x3); } static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 19, 0x3, n); } static inline u16 msg_bcast_ack(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_bcast_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } /* Note: reusing bits in word 1 for ACTIVATE_MSG only, to re-synch * link peer session number */ static inline bool msg_dest_session_valid(struct tipc_msg *m) { return msg_bits(m, 1, 16, 0x1); } static inline void msg_set_dest_session_valid(struct tipc_msg *m, bool valid) { msg_set_bits(m, 1, 16, 0x1, valid); } static inline u16 msg_dest_session(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_dest_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } /* * Word 2 */ static inline u16 msg_ack(struct tipc_msg *m) { return msg_bits(m, 2, 16, 0xffff); } static inline void msg_set_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u16 msg_seqno(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void msg_set_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Words 3-10 */ static inline u32 msg_importance(struct tipc_msg *m) { int usr = msg_user(m); if (likely((usr <= TIPC_CRITICAL_IMPORTANCE) && !msg_errcode(m))) return usr; if ((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER)) return msg_bits(m, 9, 0, 0x7); return TIPC_SYSTEM_IMPORTANCE; } static inline void msg_set_importance(struct tipc_msg *m, u32 i) { int usr = msg_user(m); if (likely((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER))) msg_set_bits(m, 9, 0, 0x7, i); else if (i < TIPC_SYSTEM_IMPORTANCE) msg_set_user(m, i); else pr_warn("Trying to set illegal importance in message\n"); } static inline u32 msg_prevnode(struct tipc_msg *m) { return msg_word(m, 3); } static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 3, a); } static inline u32 msg_origport(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = msg_inner_hdr(m); return msg_word(m, 4); } static inline void msg_set_origport(struct tipc_msg *m, u32 p) { msg_set_word(m, 4, p); } static inline u16 msg_named_seqno(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_named_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_destport(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_destport(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline u32 msg_mc_netid(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_mc_netid(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline int msg_short(struct tipc_msg *m) { return msg_hdr_sz(m) == SHORT_H_SIZE; } static inline u32 msg_orignode(struct tipc_msg *m) { if (likely(msg_short(m))) return msg_prevnode(m); return msg_word(m, 6); } static inline void msg_set_orignode(struct tipc_msg *m, u32 a) { msg_set_word(m, 6, a); } static inline u32 msg_destnode(struct tipc_msg *m) { return msg_word(m, 7); } static inline void msg_set_destnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 7, a); } static inline u32 msg_nametype(struct tipc_msg *m) { return msg_word(m, 8); } static inline void msg_set_nametype(struct tipc_msg *m, u32 n) { msg_set_word(m, 8, n); } static inline u32 msg_nameinst(struct tipc_msg *m) { return msg_word(m, 9); } static inline u32 msg_namelower(struct tipc_msg *m) { return msg_nameinst(m); } static inline void msg_set_namelower(struct tipc_msg *m, u32 n) { msg_set_word(m, 9, n); } static inline void msg_set_nameinst(struct tipc_msg *m, u32 n) { msg_set_namelower(m, n); } static inline u32 msg_nameupper(struct tipc_msg *m) { return msg_word(m, 10); } static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) { msg_set_word(m, 10, n); } /* * Constants and routines used to read and write TIPC internal message headers */ /* * Connection management protocol message types */ #define CONN_PROBE 0 #define CONN_PROBE_REPLY 1 #define CONN_ACK 2 /* * Name distributor message types */ #define PUBLICATION 0 #define WITHDRAWAL 1 /* * Segmentation message types */ #define FIRST_FRAGMENT 0 #define FRAGMENT 1 #define LAST_FRAGMENT 2 /* * Link management protocol message types */ #define STATE_MSG 0 #define RESET_MSG 1 #define ACTIVATE_MSG 2 /* * Changeover tunnel message types */ #define SYNCH_MSG 0 #define FAILOVER_MSG 1 /* * Config protocol message types */ #define DSC_REQ_MSG 0 #define DSC_RESP_MSG 1 #define DSC_TRIAL_MSG 2 #define DSC_TRIAL_FAIL_MSG 3 /* * Group protocol message types */ #define GRP_JOIN_MSG 0 #define GRP_LEAVE_MSG 1 #define GRP_ADV_MSG 2 #define GRP_ACK_MSG 3 #define GRP_RECLAIM_MSG 4 #define GRP_REMIT_MSG 5 /* Crypto message types */ #define KEY_DISTR_MSG 0 /* * Word 1 */ static inline u32 msg_seq_gap(struct tipc_msg *m) { return msg_bits(m, 1, 16, 0x1fff); } static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 16, 0x1fff, n); } static inline u32 msg_node_sig(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_node_sig(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 0, 0xffff, n); } static inline u32 msg_node_capabilities(struct tipc_msg *m) { return msg_bits(m, 1, 15, 0x1fff); } static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 15, 0x1fff, n); } /* * Word 2 */ static inline u32 msg_dest_domain(struct tipc_msg *m) { return msg_word(m, 2); } static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n) { msg_set_word(m, 2, n); } static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u32 msg_bcgap_to(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Word 4 */ static inline u32 msg_last_bcast(struct tipc_msg *m) { return msg_bits(m, 4, 16, 0xffff); } static inline u32 msg_bc_snd_nxt(struct tipc_msg *m) { return msg_last_bcast(m) + 1; } static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline u32 msg_nof_fragms(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_nof_fragms(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_fragm_no(struct tipc_msg *m) { return msg_bits(m, 4, 16, 0xffff); } static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline u16 msg_next_sent(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_next_sent(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_bc_netid(struct tipc_msg *m) { return msg_word(m, 4); } static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id) { msg_set_word(m, 4, id); } static inline u32 msg_link_selector(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = (void *)msg_data(m); return msg_bits(m, 4, 0, 1); } /* * Word 5 */ static inline u16 msg_session(struct tipc_msg *m) { return msg_bits(m, 5, 16, 0xffff); } static inline void msg_set_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 5, 16, 0xffff, n); } static inline u32 msg_probe(struct tipc_msg *m) { return msg_bits(m, 5, 0, 1); } static inline void msg_set_probe(struct tipc_msg *m, u32 val) { msg_set_bits(m, 5, 0, 1, val); } static inline char msg_net_plane(struct tipc_msg *m) { return msg_bits(m, 5, 1, 7) + 'A'; } static inline void msg_set_net_plane(struct tipc_msg *m, char n) { msg_set_bits(m, 5, 1, 7, (n - 'A')); } static inline u32 msg_linkprio(struct tipc_msg *m) { return msg_bits(m, 5, 4, 0x1f); } static inline void msg_set_linkprio(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 4, 0x1f, n); } static inline u32 msg_bearer_id(struct tipc_msg *m) { return msg_bits(m, 5, 9, 0x7); } static inline void msg_set_bearer_id(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 9, 0x7, n); } static inline u32 msg_redundant_link(struct tipc_msg *m) { return msg_bits(m, 5, 12, 0x1); } static inline void msg_set_redundant_link(struct tipc_msg *m, u32 r) { msg_set_bits(m, 5, 12, 0x1, r); } static inline u32 msg_peer_stopping(struct tipc_msg *m) { return msg_bits(m, 5, 13, 0x1); } static inline void msg_set_peer_stopping(struct tipc_msg *m, u32 s) { msg_set_bits(m, 5, 13, 0x1, s); } static inline bool msg_bc_ack_invalid(struct tipc_msg *m) { switch (msg_user(m)) { case BCAST_PROTOCOL: case NAME_DISTRIBUTOR: case LINK_PROTOCOL: return msg_bits(m, 5, 14, 0x1); default: return false; } } static inline void msg_set_bc_ack_invalid(struct tipc_msg *m, bool invalid) { msg_set_bits(m, 5, 14, 0x1, invalid); } static inline char *msg_media_addr(struct tipc_msg *m) { return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET]; } static inline u32 msg_bc_gap(struct tipc_msg *m) { return msg_bits(m, 8, 0, 0x3ff); } static inline void msg_set_bc_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 8, 0, 0x3ff, n); } /* * Word 9 */ static inline u16 msg_msgcnt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_syncpt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_syncpt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u32 msg_conn_ack(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_adv_win(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_adv_win(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u32 msg_max_pkt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff) * 4; } static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, (n / 4)); } static inline u32 msg_link_tolerance(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u16 msg_grp_bc_syncpt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_bc_acked(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_acked(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_remitted(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_remitted(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } /* Word 10 */ static inline u16 msg_grp_evt(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x3); } static inline void msg_set_grp_evt(struct tipc_msg *m, int n) { msg_set_bits(m, 10, 0, 0x3, n); } static inline u16 msg_grp_bc_ack_req(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x1); } static inline void msg_set_grp_bc_ack_req(struct tipc_msg *m, bool n) { msg_set_bits(m, 10, 0, 0x1, n); } static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) { return msg_bits(m, 10, 16, 0xffff); } static inline void msg_set_grp_bc_seqno(struct tipc_msg *m, u32 n) { msg_set_bits(m, 10, 16, 0xffff, n); } static inline bool msg_peer_link_is_up(struct tipc_msg *m) { if (likely(msg_user(m) != LINK_PROTOCOL)) return true; if (msg_type(m) == STATE_MSG) return true; return false; } static inline bool msg_peer_node_is_up(struct tipc_msg *m) { if (msg_peer_link_is_up(m)) return true; return msg_redundant_link(m); } static inline bool msg_is_reset(struct tipc_msg *hdr) { return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } /* Word 13 */ static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n) { msg_set_word(m, 13, n); } static inline u32 msg_peer_net_hash(struct tipc_msg *m) { return msg_word(m, 13); } /* Word 14 */ static inline u32 msg_sugg_node_addr(struct tipc_msg *m) { return msg_word(m, 14); } static inline void msg_set_sugg_node_addr(struct tipc_msg *m, u32 n) { msg_set_word(m, 14, n); } static inline void msg_set_node_id(struct tipc_msg *hdr, u8 *id) { memcpy(msg_data(hdr), id, 16); } static inline u8 *msg_node_id(struct tipc_msg *hdr) { return (u8 *)msg_data(hdr); } struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff **_skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, struct sk_buff_head *xmitq); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, u32 dnode, bool *new_bundle); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, int pktmax, struct sk_buff_head *frags); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy); bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb); bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy); static inline u16 buf_seqno(struct sk_buff *skb) { return msg_seqno(buf_msg(skb)); } static inline int buf_roundup_len(struct sk_buff *skb) { return (skb->len / 1024 + 1) * 1024; } /* tipc_skb_peek(): peek and reserve first buffer in list * @list: list to be peeked in * Returns pointer to first buffer in list, if any */ static inline struct sk_buff *tipc_skb_peek(struct sk_buff_head *list, spinlock_t *lock) { struct sk_buff *skb; spin_lock_bh(lock); skb = skb_peek(list); if (skb) skb_get(skb); spin_unlock_bh(lock); return skb; } /* tipc_skb_peek_port(): find a destination port, ignoring all destinations * up to and including 'filter'. * Note: ignoring previously tried destinations minimizes the risk of * contention on the socket lock * @list: list to be peeked in * @filter: last destination to be ignored from search * Returns a destination port number, of applicable. */ static inline u32 tipc_skb_peek_port(struct sk_buff_head *list, u32 filter) { struct sk_buff *skb; u32 dport = 0; bool ignore = true; spin_lock_bh(&list->lock); skb_queue_walk(list, skb) { dport = msg_destport(buf_msg(skb)); if (!filter || skb_queue_is_last(list, skb)) break; if (dport == filter) ignore = false; else if (!ignore) break; } spin_unlock_bh(&list->lock); return dport; } /* tipc_skb_dequeue(): unlink first buffer with dest 'dport' from list * @list: list to be unlinked from * @dport: selection criteria for buffer to unlink */ static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list, u32 dport) { struct sk_buff *_skb, *tmp, *skb = NULL; spin_lock_bh(&list->lock); skb_queue_walk_safe(list, _skb, tmp) { if (msg_destport(buf_msg(_skb)) == dport) { __skb_unlink(_skb, list); skb = _skb; break; } } spin_unlock_bh(&list->lock); return skb; } /* tipc_skb_queue_splice_tail - append an skb list to lock protected list * @list: the new list to append. Not lock protected * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail(struct sk_buff_head *list, struct sk_buff_head *head) { spin_lock_bh(&head->lock); skb_queue_splice_tail(list, head); spin_unlock_bh(&head->lock); } /* tipc_skb_queue_splice_tail_init - merge two lock protected skb lists * @list: the new list to add. Lock protected. Will be reinitialized * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head) { struct sk_buff_head tmp; __skb_queue_head_init(&tmp); spin_lock_bh(&list->lock); skb_queue_splice_tail_init(list, &tmp); spin_unlock_bh(&list->lock); tipc_skb_queue_splice_tail(&tmp, head); } /* __tipc_skb_dequeue() - dequeue the head skb according to expected seqno * @list: list to be dequeued from * @seqno: seqno of the expected msg * * returns skb dequeued from the list if its seqno is less than or equal to * the expected one, otherwise the skb is still hold * * Note: must be used with appropriate locks held only */ static inline struct sk_buff *__tipc_skb_dequeue(struct sk_buff_head *list, u16 seqno) { struct sk_buff *skb = skb_peek(list); if (skb && less_eq(buf_seqno(skb), seqno)) { __skb_unlink(skb, list); return skb; } return NULL; } #endif |
15 15 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 | // SPDX-License-Identifier: GPL-2.0 /* * file.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * * debugfs is for people to use instead of /proc or /sys. * See Documentation/filesystems/ for more details. */ #include <linux/module.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/io.h> #include <linux/slab.h> #include <linux/atomic.h> #include <linux/device.h> #include <linux/pm_runtime.h> #include <linux/poll.h> #include <linux/security.h> #include "internal.h" struct poll_table_struct; static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { return 0; } static ssize_t default_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { return count; } const struct file_operations debugfs_noop_file_operations = { .read = default_read_file, .write = default_write_file, .open = simple_open, .llseek = noop_llseek, }; #define F_DENTRY(filp) ((filp)->f_path.dentry) const struct file_operations *debugfs_real_fops(const struct file *filp) { struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata; if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) { /* * Urgh, we've been called w/o a protecting * debugfs_file_get(). */ WARN_ON(1); return NULL; } return fsd->real_fops; } EXPORT_SYMBOL_GPL(debugfs_real_fops); /** * debugfs_file_get - mark the beginning of file data access * @dentry: the dentry object whose data is being accessed. * * Up to a matching call to debugfs_file_put(), any successive call * into the file removing functions debugfs_remove() and * debugfs_remove_recursive() will block. Since associated private * file data may only get freed after a successful return of any of * the removal functions, you may safely access it after a successful * call to debugfs_file_get() without worrying about lifetime issues. * * If -%EIO is returned, the file has already been removed and thus, * it is not safe to access any of its data. If, on the other hand, * it is allowed to access the file data, zero is returned. */ int debugfs_file_get(struct dentry *dentry) { struct debugfs_fsdata *fsd; void *d_fsd; d_fsd = READ_ONCE(dentry->d_fsdata); if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) { fsd = d_fsd; } else { fsd = kmalloc(sizeof(*fsd), GFP_KERNEL); if (!fsd) return -ENOMEM; fsd->real_fops = (void *)((unsigned long)d_fsd & ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); refcount_set(&fsd->active_users, 1); init_completion(&fsd->active_users_drained); if (cmpxchg(&dentry->d_fsdata, d_fsd, fsd) != d_fsd) { kfree(fsd); fsd = READ_ONCE(dentry->d_fsdata); } } /* * In case of a successful cmpxchg() above, this check is * strictly necessary and must follow it, see the comment in * __debugfs_remove_file(). * OTOH, if the cmpxchg() hasn't been executed or wasn't * successful, this serves the purpose of not starving * removers. */ if (d_unlinked(dentry)) return -EIO; if (!refcount_inc_not_zero(&fsd->active_users)) return -EIO; return 0; } EXPORT_SYMBOL_GPL(debugfs_file_get); /** * debugfs_file_put - mark the end of file data access * @dentry: the dentry object formerly passed to * debugfs_file_get(). * * Allow any ongoing concurrent call into debugfs_remove() or * debugfs_remove_recursive() blocked by a former call to * debugfs_file_get() to proceed and return to its caller. */ void debugfs_file_put(struct dentry *dentry) { struct debugfs_fsdata *fsd = READ_ONCE(dentry->d_fsdata); if (refcount_dec_and_test(&fsd->active_users)) complete(&fsd->active_users_drained); } EXPORT_SYMBOL_GPL(debugfs_file_put); /* * Only permit access to world-readable files when the kernel is locked down. * We also need to exclude any file that has ways to write or alter it as root * can bypass the permissions check. */ static int debugfs_locked_down(struct inode *inode, struct file *filp, const struct file_operations *real_fops) { if ((inode->i_mode & 07777 & ~0444) == 0 && !(filp->f_mode & FMODE_WRITE) && !real_fops->unlocked_ioctl && !real_fops->compat_ioctl && !real_fops->mmap) return 0; if (security_locked_down(LOCKDOWN_DEBUGFS)) return -EPERM; return 0; } static int open_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = NULL; int r; r = debugfs_file_get(dentry); if (r) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) { r = -ENXIO; goto out; } #endif /* Huh? Module did not clean up after itself at exit? */ WARN(1, "debugfs file owner did not clean up at exit: %pd", dentry); r = -ENXIO; goto out; } replace_fops(filp, real_fops); if (real_fops->open) r = real_fops->open(inode, filp); out: debugfs_file_put(dentry); return r; } const struct file_operations debugfs_open_proxy_file_operations = { .open = open_proxy_open, }; #define PROTO(args...) args #define ARGS(args...) args #define FULL_PROXY_FUNC(name, ret_type, filp, proto, args) \ static ret_type full_proxy_ ## name(proto) \ { \ struct dentry *dentry = F_DENTRY(filp); \ const struct file_operations *real_fops; \ ret_type r; \ \ r = debugfs_file_get(dentry); \ if (unlikely(r)) \ return r; \ real_fops = debugfs_real_fops(filp); \ r = real_fops->name(args); \ debugfs_file_put(dentry); \ return r; \ } FULL_PROXY_FUNC(llseek, loff_t, filp, PROTO(struct file *filp, loff_t offset, int whence), ARGS(filp, offset, whence)); FULL_PROXY_FUNC(read, ssize_t, filp, PROTO(struct file *filp, char __user *buf, size_t size, loff_t *ppos), ARGS(filp, buf, size, ppos)); FULL_PROXY_FUNC(write, ssize_t, filp, PROTO(struct file *filp, const char __user *buf, size_t size, loff_t *ppos), ARGS(filp, buf, size, ppos)); FULL_PROXY_FUNC(unlocked_ioctl, long, filp, PROTO(struct file *filp, unsigned int cmd, unsigned long arg), ARGS(filp, cmd, arg)); static __poll_t full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { struct dentry *dentry = F_DENTRY(filp); __poll_t r = 0; const struct file_operations *real_fops; if (debugfs_file_get(dentry)) return EPOLLHUP; real_fops = debugfs_real_fops(filp); r = real_fops->poll(filp, wait); debugfs_file_put(dentry); return r; } static int full_proxy_release(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = debugfs_real_fops(filp); const struct file_operations *proxy_fops = filp->f_op; int r = 0; /* * We must not protect this against removal races here: the * original releaser should be called unconditionally in order * not to leak any resources. Releasers must not assume that * ->i_private is still being meaningful here. */ if (real_fops->release) r = real_fops->release(inode, filp); replace_fops(filp, d_inode(dentry)->i_fop); kfree(proxy_fops); fops_put(real_fops); return r; } static void __full_proxy_fops_init(struct file_operations *proxy_fops, const struct file_operations *real_fops) { proxy_fops->release = full_proxy_release; if (real_fops->llseek) proxy_fops->llseek = full_proxy_llseek; if (real_fops->read) proxy_fops->read = full_proxy_read; if (real_fops->write) proxy_fops->write = full_proxy_write; if (real_fops->poll) proxy_fops->poll = full_proxy_poll; if (real_fops->unlocked_ioctl) proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl; } static int full_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = NULL; struct file_operations *proxy_fops = NULL; int r; r = debugfs_file_get(dentry); if (r) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) { r = -ENXIO; goto out; } #endif /* Huh? Module did not cleanup after itself at exit? */ WARN(1, "debugfs file owner did not clean up at exit: %pd", dentry); r = -ENXIO; goto out; } proxy_fops = kzalloc(sizeof(*proxy_fops), GFP_KERNEL); if (!proxy_fops) { r = -ENOMEM; goto free_proxy; } __full_proxy_fops_init(proxy_fops, real_fops); replace_fops(filp, proxy_fops); if (real_fops->open) { r = real_fops->open(inode, filp); if (r) { replace_fops(filp, d_inode(dentry)->i_fop); goto free_proxy; } else if (filp->f_op != proxy_fops) { /* No protection against file removal anymore. */ WARN(1, "debugfs file owner replaced proxy fops: %pd", dentry); goto free_proxy; } } goto out; free_proxy: kfree(proxy_fops); fops_put(real_fops); out: debugfs_file_put(dentry); return r; } const struct file_operations debugfs_full_proxy_file_operations = { .open = full_proxy_open, }; ssize_t debugfs_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; ret = simple_attr_read(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } EXPORT_SYMBOL_GPL(debugfs_attr_read); static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf, size_t len, loff_t *ppos, bool is_signed) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; if (is_signed) ret = simple_attr_write_signed(file, buf, len, ppos); else ret = simple_attr_write(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } ssize_t debugfs_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return debugfs_attr_write_xsigned(file, buf, len, ppos, false); } EXPORT_SYMBOL_GPL(debugfs_attr_write); ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return debugfs_attr_write_xsigned(file, buf, len, ppos, true); } EXPORT_SYMBOL_GPL(debugfs_attr_write_signed); static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, const struct file_operations *fops_ro, const struct file_operations *fops_wo) { /* if there are no write bits set, make read only */ if (!(mode & S_IWUGO)) return debugfs_create_file_unsafe(name, mode, parent, value, fops_ro); /* if there are no read bits set, make write only */ if (!(mode & S_IRUGO)) return debugfs_create_file_unsafe(name, mode, parent, value, fops_wo); return debugfs_create_file_unsafe(name, mode, parent, value, fops); } static int debugfs_u8_set(void *data, u64 val) { *(u8 *)data = val; return 0; } static int debugfs_u8_get(void *data, u64 *val) { *val = *(u8 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); /** * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8, &fops_u8_ro, &fops_u8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u8); static int debugfs_u16_set(void *data, u64 val) { *(u16 *)data = val; return 0; } static int debugfs_u16_get(void *data, u64 *val) { *val = *(u16 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); /** * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16, &fops_u16_ro, &fops_u16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u16); static int debugfs_u32_set(void *data, u64 val) { *(u32 *)data = val; return 0; } static int debugfs_u32_get(void *data, u64 *val) { *val = *(u32 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); /** * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32, &fops_u32_ro, &fops_u32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u32); static int debugfs_u64_set(void *data, u64 val) { *(u64 *)data = val; return 0; } static int debugfs_u64_get(void *data, u64 *val) { *val = *(u64 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); /** * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64, &fops_u64_ro, &fops_u64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u64); static int debugfs_ulong_set(void *data, u64 val) { *(unsigned long *)data = val; return 0; } static int debugfs_ulong_get(void *data, u64 *val) { *val = *(unsigned long *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); /** * debugfs_create_ulong - create a debugfs file that is used to read and write * an unsigned long value. * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent, unsigned long *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_ulong, &fops_ulong_ro, &fops_ulong_wo); } EXPORT_SYMBOL_GPL(debugfs_create_ulong); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); /* * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value * * These functions are exactly the same as the above functions (but use a hex * output for the decimal challenged). For details look at the above unsigned * decimal functions. */ /** * debugfs_create_x8 - create a debugfs file that is used to read and write an unsigned 8-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8, &fops_x8_ro, &fops_x8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x8); /** * debugfs_create_x16 - create a debugfs file that is used to read and write an unsigned 16-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16, &fops_x16_ro, &fops_x16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x16); /** * debugfs_create_x32 - create a debugfs file that is used to read and write an unsigned 32-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32, &fops_x32_ro, &fops_x32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x32); /** * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64, &fops_x64_ro, &fops_x64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x64); static int debugfs_size_t_set(void *data, u64 val) { *(size_t *)data = val; return 0; } static int debugfs_size_t_get(void *data, u64 *val) { *val = *(size_t *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, "%llu\n"); /* %llu and %zu are more or less the same */ DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); /** * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_size_t(const char *name, umode_t mode, struct dentry *parent, size_t *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_size_t, &fops_size_t_ro, &fops_size_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_size_t); static int debugfs_atomic_t_set(void *data, u64 val) { atomic_set((atomic_t *)data, val); return 0; } static int debugfs_atomic_t_get(void *data, u64 *val) { *val = atomic_read((atomic_t *)data); return 0; } DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); /** * debugfs_create_atomic_t - create a debugfs file that is used to read and * write an atomic_t value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_atomic_t(const char *name, umode_t mode, struct dentry *parent, atomic_t *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_atomic_t, &fops_atomic_t_ro, &fops_atomic_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[2]; bool val; int r; struct dentry *dentry = F_DENTRY(file); r = debugfs_file_get(dentry); if (unlikely(r)) return r; val = *(bool *)file->private_data; debugfs_file_put(dentry); if (val) buf[0] = 'Y'; else buf[0] = 'N'; buf[1] = '\n'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } EXPORT_SYMBOL_GPL(debugfs_read_file_bool); ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { bool bv; int r; bool *val = file->private_data; struct dentry *dentry = F_DENTRY(file); r = kstrtobool_from_user(user_buf, count, &bv); if (!r) { r = debugfs_file_get(dentry); if (unlikely(r)) return r; *val = bv; debugfs_file_put(dentry); } return count; } EXPORT_SYMBOL_GPL(debugfs_write_file_bool); static const struct file_operations fops_bool = { .read = debugfs_read_file_bool, .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_bool_ro = { .read = debugfs_read_file_bool, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_bool_wo = { .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, bool *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool, &fops_bool_ro, &fops_bool_wo); } EXPORT_SYMBOL_GPL(debugfs_create_bool); ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); char *str, *copy = NULL; int copy_len, len; ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; str = *(char **)file->private_data; len = strlen(str) + 1; copy = kmalloc(len, GFP_KERNEL); if (!copy) { debugfs_file_put(dentry); return -ENOMEM; } copy_len = strscpy(copy, str, len); debugfs_file_put(dentry); if (copy_len < 0) { kfree(copy); return copy_len; } copy[copy_len] = '\n'; ret = simple_read_from_buffer(user_buf, count, ppos, copy, len); kfree(copy); return ret; } EXPORT_SYMBOL_GPL(debugfs_create_str); static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); char *old, *new = NULL; int pos = *ppos; int r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; old = *(char **)file->private_data; /* only allow strict concatenation */ r = -EINVAL; if (pos && pos != strlen(old)) goto error; r = -E2BIG; if (pos + count + 1 > PAGE_SIZE) goto error; r = -ENOMEM; new = kmalloc(pos + count + 1, GFP_KERNEL); if (!new) goto error; if (pos) memcpy(new, old, pos); r = -EFAULT; if (copy_from_user(new + pos, user_buf, count)) goto error; new[pos + count] = '\0'; strim(new); rcu_assign_pointer(*(char **)file->private_data, new); synchronize_rcu(); kfree(old); debugfs_file_put(dentry); return count; error: kfree(new); debugfs_file_put(dentry); return r; } static const struct file_operations fops_str = { .read = debugfs_read_file_str, .write = debugfs_write_file_str, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_str_ro = { .read = debugfs_read_file_str, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_str_wo = { .write = debugfs_write_file_str, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_str - create a debugfs file that is used to read and write a string value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_str(const char *name, umode_t mode, struct dentry *parent, char **value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_str, &fops_str_ro, &fops_str_wo); } static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct debugfs_blob_wrapper *blob = file->private_data; struct dentry *dentry = F_DENTRY(file); ssize_t r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; r = simple_read_from_buffer(user_buf, count, ppos, blob->data, blob->size); debugfs_file_put(dentry); return r; } static const struct file_operations fops_blob = { .read = read_file_blob, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_blob - create a debugfs file that is used to read a binary blob * @name: a pointer to a string containing the name of the file to create. * @mode: the read permission that the file should have (other permissions are * masked out) * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @blob: a pointer to a struct debugfs_blob_wrapper which contains a pointer * to the blob data and the size of the data. * * This function creates a file in debugfs with the given name that exports * @blob->data as a binary blob. If the @mode variable is so set it can be * read from. Writing is not supported. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { return debugfs_create_file_unsafe(name, mode & 0444, parent, blob, &fops_blob); } EXPORT_SYMBOL_GPL(debugfs_create_blob); static size_t u32_format_array(char *buf, size_t bufsize, u32 *array, int array_size) { size_t ret = 0; while (--array_size >= 0) { size_t len; char term = array_size ? ' ' : '\n'; len = snprintf(buf, bufsize, "%u%c", *array++, term); ret += len; buf += len; bufsize -= len; } return ret; } static int u32_array_open(struct inode *inode, struct file *file) { struct debugfs_u32_array *data = inode->i_private; int size, elements = data->n_elements; char *buf; /* * Max size: * - 10 digits + ' '/'\n' = 11 bytes per number * - terminating NUL character */ size = elements*11; buf = kmalloc(size+1, GFP_KERNEL); if (!buf) return -ENOMEM; buf[size] = 0; file->private_data = buf; u32_format_array(buf, size, data->array, data->n_elements); return nonseekable_open(inode, file); } static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { size_t size = strlen(file->private_data); return simple_read_from_buffer(buf, len, ppos, file->private_data, size); } static int u32_array_release(struct inode *inode, struct file *file) { kfree(file->private_data); return 0; } static const struct file_operations u32_array_fops = { .owner = THIS_MODULE, .open = u32_array_open, .release = u32_array_release, .read = u32_array_read, .llseek = no_llseek, }; /** * debugfs_create_u32_array - create a debugfs file that is used to read u32 * array. * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @array: wrapper struct containing data pointer and size of the array. * * This function creates a file in debugfs with the given name that exports * @array as data. If the @mode variable is so set it can be read from. * Writing is not supported. Seek within the file is also not supported. * Once array is created its size can not be changed. */ void debugfs_create_u32_array(const char *name, umode_t mode, struct dentry *parent, struct debugfs_u32_array *array) { debugfs_create_file_unsafe(name, mode, parent, array, &u32_array_fops); } EXPORT_SYMBOL_GPL(debugfs_create_u32_array); #ifdef CONFIG_HAS_IOMEM /* * The regset32 stuff is used to print 32-bit registers using the * seq_file utilities. We offer printing a register set in an already-opened * sequential file or create a debugfs file that only prints a regset32. */ /** * debugfs_print_regs32 - use seq_print to describe a set of registers * @s: the seq_file structure being used to generate output * @regs: an array if struct debugfs_reg32 structures * @nregs: the length of the above array * @base: the base address to be used in reading the registers * @prefix: a string to be prefixed to every output line * * This function outputs a text block describing the current values of * some 32-bit hardware registers. It is meant to be used within debugfs * files based on seq_file that need to show registers, intermixed with other * information. The prefix argument may be used to specify a leading string, * because some peripherals have several blocks of identical registers, * for example configuration of dma channels */ void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, int nregs, void __iomem *base, char *prefix) { int i; for (i = 0; i < nregs; i++, regs++) { if (prefix) seq_printf(s, "%s", prefix); seq_printf(s, "%s = 0x%08x\n", regs->name, readl(base + regs->offset)); if (seq_has_overflowed(s)) break; } } EXPORT_SYMBOL_GPL(debugfs_print_regs32); static int debugfs_regset32_show(struct seq_file *s, void *data) { struct debugfs_regset32 *regset = s->private; if (regset->dev) pm_runtime_get_sync(regset->dev); debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, ""); if (regset->dev) pm_runtime_put(regset->dev); return 0; } DEFINE_SHOW_ATTRIBUTE(debugfs_regset32); /** * debugfs_create_regset32 - create a debugfs file that returns register values * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @regset: a pointer to a struct debugfs_regset32, which contains a pointer * to an array of register definitions, the array size and the base * address where the register bank is to be found. * * This function creates a file in debugfs with the given name that reports * the names and values of a set of 32-bit registers. If the @mode variable * is so set it can be read from. Writing is not supported. */ void debugfs_create_regset32(const char *name, umode_t mode, struct dentry *parent, struct debugfs_regset32 *regset) { debugfs_create_file(name, mode, parent, regset, &debugfs_regset32_fops); } EXPORT_SYMBOL_GPL(debugfs_create_regset32); #endif /* CONFIG_HAS_IOMEM */ struct debugfs_devm_entry { int (*read)(struct seq_file *seq, void *data); struct device *dev; }; static int debugfs_devm_entry_open(struct inode *inode, struct file *f) { struct debugfs_devm_entry *entry = inode->i_private; return single_open(f, entry->read, entry->dev); } static const struct file_operations debugfs_devm_entry_ops = { .owner = THIS_MODULE, .open = debugfs_devm_entry_open, .release = single_release, .read = seq_read, .llseek = seq_lseek }; /** * debugfs_create_devm_seqfile - create a debugfs file that is bound to device. * * @dev: device related to this debugfs file. * @name: name of the debugfs file. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @read_fn: function pointer called to print the seq_file content. */ void debugfs_create_devm_seqfile(struct device *dev, const char *name, struct dentry *parent, int (*read_fn)(struct seq_file *s, void *data)) { struct debugfs_devm_entry *entry; if (IS_ERR(parent)) return; entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL); if (!entry) return; entry->read = read_fn; entry->dev = dev; debugfs_create_file(name, S_IRUGO, parent, entry, &debugfs_devm_entry_ops); } EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile); |
1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 | // SPDX-License-Identifier: GPL-2.0-or-later /* * CALIPSO - Common Architecture Label IPv6 Security Option * * This is an implementation of the CALIPSO protocol as specified in * RFC 5570. * * Authors: Paul Moore <paul.moore@hp.com> * Huw Davies <huw@codeweavers.com> */ /* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015 */ #include <linux/init.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/jhash.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/netlabel.h> #include <net/calipso.h> #include <linux/atomic.h> #include <linux/bug.h> #include <asm/unaligned.h> #include <linux/crc-ccitt.h> /* Maximium size of the calipso option including * the two-byte TLV header. */ #define CALIPSO_OPT_LEN_MAX (2 + 252) /* Size of the minimum calipso option including * the two-byte TLV header. */ #define CALIPSO_HDR_LEN (2 + 8) /* Maximium size of the calipso option including * the two-byte TLV header and upto 3 bytes of * leading pad and 7 bytes of trailing pad. */ #define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7) /* Maximium size of u32 aligned buffer required to hold calipso * option. Max of 3 initial pad bytes starting from buffer + 3. * i.e. the worst case is when the previous tlv finishes on 4n + 3. */ #define CALIPSO_MAX_BUFFER (6 + CALIPSO_OPT_LEN_MAX) /* List of available DOI definitions */ static DEFINE_SPINLOCK(calipso_doi_list_lock); static LIST_HEAD(calipso_doi_list); /* Label mapping cache */ int calipso_cache_enabled = 1; int calipso_cache_bucketsize = 10; #define CALIPSO_CACHE_BUCKETBITS 7 #define CALIPSO_CACHE_BUCKETS BIT(CALIPSO_CACHE_BUCKETBITS) #define CALIPSO_CACHE_REORDERLIMIT 10 struct calipso_map_cache_bkt { spinlock_t lock; u32 size; struct list_head list; }; struct calipso_map_cache_entry { u32 hash; unsigned char *key; size_t key_len; struct netlbl_lsm_cache *lsm_data; u32 activity; struct list_head list; }; static struct calipso_map_cache_bkt *calipso_cache; static void calipso_cache_invalidate(void); static void calipso_doi_putdef(struct calipso_doi *doi_def); /* Label Mapping Cache Functions */ /** * calipso_cache_entry_free - Frees a cache entry * @entry: the entry to free * * Description: * This function frees the memory associated with a cache entry including the * LSM cache data if there are no longer any users, i.e. reference count == 0. * */ static void calipso_cache_entry_free(struct calipso_map_cache_entry *entry) { if (entry->lsm_data) netlbl_secattr_cache_free(entry->lsm_data); kfree(entry->key); kfree(entry); } /** * calipso_map_cache_hash - Hashing function for the CALIPSO cache * @key: the hash key * @key_len: the length of the key in bytes * * Description: * The CALIPSO tag hashing function. Returns a 32-bit hash value. * */ static u32 calipso_map_cache_hash(const unsigned char *key, u32 key_len) { return jhash(key, key_len, 0); } /** * calipso_cache_init - Initialize the CALIPSO cache * * Description: * Initializes the CALIPSO label mapping cache, this function should be called * before any of the other functions defined in this file. Returns zero on * success, negative values on error. * */ static int __init calipso_cache_init(void) { u32 iter; calipso_cache = kcalloc(CALIPSO_CACHE_BUCKETS, sizeof(struct calipso_map_cache_bkt), GFP_KERNEL); if (!calipso_cache) return -ENOMEM; for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { spin_lock_init(&calipso_cache[iter].lock); calipso_cache[iter].size = 0; INIT_LIST_HEAD(&calipso_cache[iter].list); } return 0; } /** * calipso_cache_invalidate - Invalidates the current CALIPSO cache * * Description: * Invalidates and frees any entries in the CALIPSO cache. Returns zero on * success and negative values on failure. * */ static void calipso_cache_invalidate(void) { struct calipso_map_cache_entry *entry, *tmp_entry; u32 iter; for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { spin_lock_bh(&calipso_cache[iter].lock); list_for_each_entry_safe(entry, tmp_entry, &calipso_cache[iter].list, list) { list_del(&entry->list); calipso_cache_entry_free(entry); } calipso_cache[iter].size = 0; spin_unlock_bh(&calipso_cache[iter].lock); } } /** * calipso_cache_check - Check the CALIPSO cache for a label mapping * @key: the buffer to check * @key_len: buffer length in bytes * @secattr: the security attribute struct to use * * Description: * This function checks the cache to see if a label mapping already exists for * the given key. If there is a match then the cache is adjusted and the * @secattr struct is populated with the correct LSM security attributes. The * cache is adjusted in the following manner if the entry is not already the * first in the cache bucket: * * 1. The cache entry's activity counter is incremented * 2. The previous (higher ranking) entry's activity counter is decremented * 3. If the difference between the two activity counters is geater than * CALIPSO_CACHE_REORDERLIMIT the two entries are swapped * * Returns zero on success, -ENOENT for a cache miss, and other negative values * on error. * */ static int calipso_cache_check(const unsigned char *key, u32 key_len, struct netlbl_lsm_secattr *secattr) { u32 bkt; struct calipso_map_cache_entry *entry; struct calipso_map_cache_entry *prev_entry = NULL; u32 hash; if (!calipso_cache_enabled) return -ENOENT; hash = calipso_map_cache_hash(key, key_len); bkt = hash & (CALIPSO_CACHE_BUCKETS - 1); spin_lock_bh(&calipso_cache[bkt].lock); list_for_each_entry(entry, &calipso_cache[bkt].list, list) { if (entry->hash == hash && entry->key_len == key_len && memcmp(entry->key, key, key_len) == 0) { entry->activity += 1; refcount_inc(&entry->lsm_data->refcount); secattr->cache = entry->lsm_data; secattr->flags |= NETLBL_SECATTR_CACHE; secattr->type = NETLBL_NLTYPE_CALIPSO; if (!prev_entry) { spin_unlock_bh(&calipso_cache[bkt].lock); return 0; } if (prev_entry->activity > 0) prev_entry->activity -= 1; if (entry->activity > prev_entry->activity && entry->activity - prev_entry->activity > CALIPSO_CACHE_REORDERLIMIT) { __list_del(entry->list.prev, entry->list.next); __list_add(&entry->list, prev_entry->list.prev, &prev_entry->list); } spin_unlock_bh(&calipso_cache[bkt].lock); return 0; } prev_entry = entry; } spin_unlock_bh(&calipso_cache[bkt].lock); return -ENOENT; } /** * calipso_cache_add - Add an entry to the CALIPSO cache * @calipso_ptr: the CALIPSO option * @secattr: the packet's security attributes * * Description: * Add a new entry into the CALIPSO label mapping cache. Add the new entry to * head of the cache bucket's list, if the cache bucket is out of room remove * the last entry in the list first. It is important to note that there is * currently no checking for duplicate keys. Returns zero on success, * negative values on failure. The key stored starts at calipso_ptr + 2, * i.e. the type and length bytes are not stored, this corresponds to * calipso_ptr[1] bytes of data. * */ static int calipso_cache_add(const unsigned char *calipso_ptr, const struct netlbl_lsm_secattr *secattr) { int ret_val = -EPERM; u32 bkt; struct calipso_map_cache_entry *entry = NULL; struct calipso_map_cache_entry *old_entry = NULL; u32 calipso_ptr_len; if (!calipso_cache_enabled || calipso_cache_bucketsize <= 0) return 0; calipso_ptr_len = calipso_ptr[1]; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; entry->key = kmemdup(calipso_ptr + 2, calipso_ptr_len, GFP_ATOMIC); if (!entry->key) { ret_val = -ENOMEM; goto cache_add_failure; } entry->key_len = calipso_ptr_len; entry->hash = calipso_map_cache_hash(calipso_ptr, calipso_ptr_len); refcount_inc(&secattr->cache->refcount); entry->lsm_data = secattr->cache; bkt = entry->hash & (CALIPSO_CACHE_BUCKETS - 1); spin_lock_bh(&calipso_cache[bkt].lock); if (calipso_cache[bkt].size < calipso_cache_bucketsize) { list_add(&entry->list, &calipso_cache[bkt].list); calipso_cache[bkt].size += 1; } else { old_entry = list_entry(calipso_cache[bkt].list.prev, struct calipso_map_cache_entry, list); list_del(&old_entry->list); list_add(&entry->list, &calipso_cache[bkt].list); calipso_cache_entry_free(old_entry); } spin_unlock_bh(&calipso_cache[bkt].lock); return 0; cache_add_failure: if (entry) calipso_cache_entry_free(entry); return ret_val; } /* DOI List Functions */ /** * calipso_doi_search - Searches for a DOI definition * @doi: the DOI to search for * * Description: * Search the DOI definition list for a DOI definition with a DOI value that * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). * Returns a pointer to the DOI definition on success and NULL on failure. */ static struct calipso_doi *calipso_doi_search(u32 doi) { struct calipso_doi *iter; list_for_each_entry_rcu(iter, &calipso_doi_list, list) if (iter->doi == doi && refcount_read(&iter->refcount)) return iter; return NULL; } /** * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine * @doi_def: the DOI structure * @audit_info: NetLabel audit information * * Description: * The caller defines a new DOI for use by the CALIPSO engine and calls this * function to add it to the list of acceptable domains. The caller must * ensure that the mapping table specified in @doi_def->map meets all of the * requirements of the mapping type (see calipso.h for details). Returns * zero on success and non-zero on failure. * */ static int calipso_doi_add(struct calipso_doi *doi_def, struct netlbl_audit *audit_info) { int ret_val = -EINVAL; u32 doi; u32 doi_type; struct audit_buffer *audit_buf; doi = doi_def->doi; doi_type = doi_def->type; if (doi_def->doi == CALIPSO_DOI_UNKNOWN) goto doi_add_return; refcount_set(&doi_def->refcount, 1); spin_lock(&calipso_doi_list_lock); if (calipso_doi_search(doi_def->doi)) { spin_unlock(&calipso_doi_list_lock); ret_val = -EEXIST; goto doi_add_return; } list_add_tail_rcu(&doi_def->list, &calipso_doi_list); spin_unlock(&calipso_doi_list_lock); ret_val = 0; doi_add_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_ADD, audit_info); if (audit_buf) { const char *type_str; switch (doi_type) { case CALIPSO_MAP_PASS: type_str = "pass"; break; default: type_str = "(unknown)"; } audit_log_format(audit_buf, " calipso_doi=%u calipso_type=%s res=%u", doi, type_str, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * calipso_doi_free - Frees a DOI definition * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. * */ static void calipso_doi_free(struct calipso_doi *doi_def) { kfree(doi_def); } /** * calipso_doi_free_rcu - Frees a DOI definition via the RCU pointer * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that the memory allocated to the DOI definition can be released * safely. * */ static void calipso_doi_free_rcu(struct rcu_head *entry) { struct calipso_doi *doi_def; doi_def = container_of(entry, struct calipso_doi, rcu); calipso_doi_free(doi_def); } /** * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine * @doi: the DOI value * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will * be called to release their own LSM domain mappings as well as our own * domain list. Returns zero on success and negative values on failure. * */ static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) { int ret_val; struct calipso_doi *doi_def; struct audit_buffer *audit_buf; spin_lock(&calipso_doi_list_lock); doi_def = calipso_doi_search(doi); if (!doi_def) { spin_unlock(&calipso_doi_list_lock); ret_val = -ENOENT; goto doi_remove_return; } list_del_rcu(&doi_def->list); spin_unlock(&calipso_doi_list_lock); calipso_doi_putdef(doi_def); ret_val = 0; doi_remove_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_DEL, audit_info); if (audit_buf) { audit_log_format(audit_buf, " calipso_doi=%u res=%u", doi, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * calipso_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that * calipso_doi_putdef() is called when the caller is done. * */ static struct calipso_doi *calipso_doi_getdef(u32 doi) { struct calipso_doi *doi_def; rcu_read_lock(); doi_def = calipso_doi_search(doi); if (!doi_def) goto doi_getdef_return; if (!refcount_inc_not_zero(&doi_def->refcount)) doi_def = NULL; doi_getdef_return: rcu_read_unlock(); return doi_def; } /** * calipso_doi_putdef - Releases a reference for the given DOI definition * @doi_def: the DOI definition * * Description: * Releases a DOI definition reference obtained from calipso_doi_getdef(). * */ static void calipso_doi_putdef(struct calipso_doi *doi_def) { if (!doi_def) return; if (!refcount_dec_and_test(&doi_def->refcount)) return; calipso_cache_invalidate(); call_rcu(&doi_def->rcu, calipso_doi_free_rcu); } /** * calipso_doi_walk - Iterate through the DOI definitions * @skip_cnt: skip past this number of DOI definitions, updated * @callback: callback for each DOI definition * @cb_arg: argument for the callback function * * Description: * Iterate over the DOI definition list, skipping the first @skip_cnt entries. * For each entry call @callback, if @callback returns a negative value stop * 'walking' through the list and return. Updates the value in @skip_cnt upon * return. Returns zero on success, negative values on failure. * */ static int calipso_doi_walk(u32 *skip_cnt, int (*callback)(struct calipso_doi *doi_def, void *arg), void *cb_arg) { int ret_val = -ENOENT; u32 doi_cnt = 0; struct calipso_doi *iter_doi; rcu_read_lock(); list_for_each_entry_rcu(iter_doi, &calipso_doi_list, list) if (refcount_read(&iter_doi->refcount) > 0) { if (doi_cnt++ < *skip_cnt) continue; ret_val = callback(iter_doi, cb_arg); if (ret_val < 0) { doi_cnt--; goto doi_walk_return; } } doi_walk_return: rcu_read_unlock(); *skip_cnt = doi_cnt; return ret_val; } /** * calipso_validate - Validate a CALIPSO option * @skb: the packet * @option: the start of the option * * Description: * This routine is called to validate a CALIPSO option. * If the option is valid then %true is returned, otherwise * %false is returned. * * The caller should have already checked that the length of the * option (including the TLV header) is >= 10 and that the catmap * length is consistent with the option length. * * We leave checks on the level and categories to the socket layer. */ bool calipso_validate(const struct sk_buff *skb, const unsigned char *option) { struct calipso_doi *doi_def; bool ret_val; u16 crc, len = option[1] + 2; static const u8 zero[2]; /* The original CRC runs over the option including the TLV header * with the CRC-16 field (at offset 8) zeroed out. */ crc = crc_ccitt(0xffff, option, 8); crc = crc_ccitt(crc, zero, sizeof(zero)); if (len > 10) crc = crc_ccitt(crc, option + 10, len - 10); crc = ~crc; if (option[8] != (crc & 0xff) || option[9] != ((crc >> 8) & 0xff)) return false; rcu_read_lock(); doi_def = calipso_doi_search(get_unaligned_be32(option + 2)); ret_val = !!doi_def; rcu_read_unlock(); return ret_val; } /** * calipso_map_cat_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category bitmap in network/CALIPSO format * @net_cat_len: the length of the CALIPSO bitmap in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CALIPSO bitmap using the given DOI definition. Returns the minimum * size in bytes of the network bitmap on success, negative values otherwise. * */ static int calipso_map_cat_hton(const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int spot = -1; u32 net_spot_max = 0; u32 net_clen_bits = net_cat_len * 8; for (;;) { spot = netlbl_catmap_walk(secattr->attr.mls.cat, spot + 1); if (spot < 0) break; if (spot >= net_clen_bits) return -ENOSPC; netlbl_bitmap_setbit(net_cat, spot, 1); if (spot > net_spot_max) net_spot_max = spot; } return (net_spot_max / 32 + 1) * 4; } /** * calipso_map_cat_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category bitmap in network/CALIPSO format * @net_cat_len: the length of the CALIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CALIPSO bitmap to the correct local * MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; int spot = -1; u32 net_clen_bits = net_cat_len * 8; for (;;) { spot = netlbl_bitmap_walk(net_cat, net_clen_bits, spot + 1, 1); if (spot < 0) { if (spot == -2) return -EFAULT; return 0; } ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, spot, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return -EINVAL; } /** * calipso_pad_write - Writes pad bytes in TLV format * @buf: the buffer * @offset: offset from start of buffer to write padding * @count: number of pad bytes to write * * Description: * Write @count bytes of TLV padding into @buffer starting at offset @offset. * @count should be less than 8 - see RFC 4942. * */ static int calipso_pad_write(unsigned char *buf, unsigned int offset, unsigned int count) { if (WARN_ON_ONCE(count >= 8)) return -EINVAL; switch (count) { case 0: break; case 1: buf[offset] = IPV6_TLV_PAD1; break; default: buf[offset] = IPV6_TLV_PADN; buf[offset + 1] = count - 2; if (count > 2) memset(buf + offset + 2, 0, count - 2); break; } return 0; } /** * calipso_genopt - Generate a CALIPSO option * @buf: the option buffer * @start: offset from which to write * @buf_len: the size of opt_buf * @doi_def: the CALIPSO DOI to use * @secattr: the security attributes * * Description: * Generate a CALIPSO option using the DOI definition and security attributes * passed to the function. This also generates upto three bytes of leading * padding that ensures that the option is 4n + 2 aligned. It returns the * number of bytes written (including any initial padding). */ static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; u32 len, pad; u16 crc; static const unsigned char padding[4] = {2, 1, 0, 3}; unsigned char *calipso; /* CALIPSO has 4n + 2 alignment */ pad = padding[start & 3]; if (buf_len <= start + pad + CALIPSO_HDR_LEN) return -ENOSPC; if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) return -EPERM; len = CALIPSO_HDR_LEN; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = calipso_map_cat_hton(doi_def, secattr, buf + start + pad + len, buf_len - start - pad - len); if (ret_val < 0) return ret_val; len += ret_val; } calipso_pad_write(buf, start, pad); calipso = buf + start + pad; calipso[0] = IPV6_TLV_CALIPSO; calipso[1] = len - 2; *(__be32 *)(calipso + 2) = htonl(doi_def->doi); calipso[6] = (len - CALIPSO_HDR_LEN) / 4; calipso[7] = secattr->attr.mls.lvl; crc = ~crc_ccitt(0xffff, calipso, len); calipso[8] = crc & 0xff; calipso[9] = (crc >> 8) & 0xff; return pad + len; } /* Hop-by-hop hdr helper functions */ /** * calipso_opt_update - Replaces socket's hop options with a new set * @sk: the socket * @hop: new hop options * * Description: * Replaces @sk's hop options with @hop. @hop may be NULL to leave * the socket with no hop options. * */ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop) { struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts; txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop); txopt_put(old); if (IS_ERR(txopts)) return PTR_ERR(txopts); txopts = ipv6_update_options(sk, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } return 0; } /** * calipso_tlv_len - Returns the length of the TLV * @opt: the option header * @offset: offset of the TLV within the header * * Description: * Returns the length of the TLV option at offset @offset within * the option header @opt. Checks that the entire TLV fits inside * the option header, returns a negative value if this is not the case. */ static int calipso_tlv_len(struct ipv6_opt_hdr *opt, unsigned int offset) { unsigned char *tlv = (unsigned char *)opt; unsigned int opt_len = ipv6_optlen(opt), tlv_len; if (offset < sizeof(*opt) || offset >= opt_len) return -EINVAL; if (tlv[offset] == IPV6_TLV_PAD1) return 1; if (offset + 1 >= opt_len) return -EINVAL; tlv_len = tlv[offset + 1] + 2; if (offset + tlv_len > opt_len) return -EINVAL; return tlv_len; } /** * calipso_opt_find - Finds the CALIPSO option in an IPv6 hop options header * @hop: the hop options header * @start: on return holds the offset of any leading padding * @end: on return holds the offset of the first non-pad TLV after CALIPSO * * Description: * Finds the space occupied by a CALIPSO option (including any leading and * trailing padding). * * If a CALIPSO option exists set @start and @end to the * offsets within @hop of the start of padding before the first * CALIPSO option and the end of padding after the first CALIPSO * option. In this case the function returns 0. * * In the absence of a CALIPSO option, @start and @end will be * set to the start and end of any trailing padding in the header. * This is useful when appending a new option, as the caller may want * to overwrite some of this padding. In this case the function will * return -ENOENT. */ static int calipso_opt_find(struct ipv6_opt_hdr *hop, unsigned int *start, unsigned int *end) { int ret_val = -ENOENT, tlv_len; unsigned int opt_len, offset, offset_s = 0, offset_e = 0; unsigned char *opt = (unsigned char *)hop; opt_len = ipv6_optlen(hop); offset = sizeof(*hop); while (offset < opt_len) { tlv_len = calipso_tlv_len(hop, offset); if (tlv_len < 0) return tlv_len; switch (opt[offset]) { case IPV6_TLV_PAD1: case IPV6_TLV_PADN: if (offset_e) offset_e = offset; break; case IPV6_TLV_CALIPSO: ret_val = 0; offset_e = offset; break; default: if (offset_e == 0) offset_s = offset; else goto out; } offset += tlv_len; } out: if (offset_s) *start = offset_s + calipso_tlv_len(hop, offset_s); else *start = sizeof(*hop); if (offset_e) *end = offset_e + calipso_tlv_len(hop, offset_e); else *end = opt_len; return ret_val; } /** * calipso_opt_insert - Inserts a CALIPSO option into an IPv6 hop opt hdr * @hop: the original hop options header * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Creates a new hop options header based on @hop with a * CALIPSO option added to it. If @hop already contains a CALIPSO * option this is overwritten, otherwise the new option is appended * after any existing options. If @hop is NULL then the new header * will contain just the CALIPSO option and any needed padding. * */ static struct ipv6_opt_hdr * calipso_opt_insert(struct ipv6_opt_hdr *hop, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { unsigned int start, end, buf_len, pad, hop_len; struct ipv6_opt_hdr *new; int ret_val; if (hop) { hop_len = ipv6_optlen(hop); ret_val = calipso_opt_find(hop, &start, &end); if (ret_val && ret_val != -ENOENT) return ERR_PTR(ret_val); } else { hop_len = 0; start = sizeof(*hop); end = 0; } buf_len = hop_len + start - end + CALIPSO_OPT_LEN_MAX_WITH_PAD; new = kzalloc(buf_len, GFP_ATOMIC); if (!new) return ERR_PTR(-ENOMEM); if (start > sizeof(*hop)) memcpy(new, hop, start); ret_val = calipso_genopt((unsigned char *)new, start, buf_len, doi_def, secattr); if (ret_val < 0) { kfree(new); return ERR_PTR(ret_val); } buf_len = start + ret_val; /* At this point buf_len aligns to 4n, so (buf_len & 4) pads to 8n */ pad = ((buf_len & 4) + (end & 7)) & 7; calipso_pad_write((unsigned char *)new, buf_len, pad); buf_len += pad; if (end != hop_len) { memcpy((char *)new + buf_len, (char *)hop + end, hop_len - end); buf_len += hop_len - end; } new->nexthdr = 0; new->hdrlen = buf_len / 8 - 1; return new; } /** * calipso_opt_del - Removes the CALIPSO option from an option header * @hop: the original header * @new: the new header * * Description: * Creates a new header based on @hop without any CALIPSO option. If @hop * doesn't contain a CALIPSO option it returns -ENOENT. If @hop contains * no other non-padding options, it returns zero with @new set to NULL. * Otherwise it returns zero, creates a new header without the CALIPSO * option (and removing as much padding as possible) and returns with * @new set to that header. * */ static int calipso_opt_del(struct ipv6_opt_hdr *hop, struct ipv6_opt_hdr **new) { int ret_val; unsigned int start, end, delta, pad, hop_len; ret_val = calipso_opt_find(hop, &start, &end); if (ret_val) return ret_val; hop_len = ipv6_optlen(hop); if (start == sizeof(*hop) && end == hop_len) { /* There's no other option in the header so return NULL */ *new = NULL; return 0; } delta = (end - start) & ~7; *new = kzalloc(hop_len - delta, GFP_ATOMIC); if (!*new) return -ENOMEM; memcpy(*new, hop, start); (*new)->hdrlen -= delta / 8; pad = (end - start) & 7; calipso_pad_write((unsigned char *)*new, start, pad); if (end != hop_len) memcpy((char *)*new + start + pad, (char *)hop + end, hop_len - end); return 0; } /** * calipso_opt_getattr - Get the security attributes from a memory block * @calipso: the CALIPSO option * @secattr: the security attributes * * Description: * Inspect @calipso and return the security attributes in @secattr. * Returns zero on success and negative values on failure. * */ static int calipso_opt_getattr(const unsigned char *calipso, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; u32 doi, len = calipso[1], cat_len = calipso[6] * 4; struct calipso_doi *doi_def; if (cat_len + 8 > len) return -EINVAL; if (calipso_cache_check(calipso + 2, calipso[1], secattr) == 0) return 0; doi = get_unaligned_be32(calipso + 2); rcu_read_lock(); doi_def = calipso_doi_search(doi); if (!doi_def) goto getattr_return; secattr->attr.mls.lvl = calipso[7]; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (cat_len) { ret_val = calipso_map_cat_ntoh(doi_def, calipso + 10, cat_len, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); goto getattr_return; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } secattr->type = NETLBL_NLTYPE_CALIPSO; getattr_return: rcu_read_unlock(); return ret_val; } /* sock functions. */ /** * calipso_sock_getattr - Get the security attributes from a sock * @sk: the sock * @secattr: the security attributes * * Description: * Query @sk to see if there is a CALIPSO option attached to the sock and if * there is return the CALIPSO security attributes in @secattr. This function * requires that @sk be locked, or privately held, but it does not do any * locking itself. Returns zero on success and negative values on failure. * */ static int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { struct ipv6_opt_hdr *hop; int opt_len, len, ret_val = -ENOMSG, offset; unsigned char *opt; struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); if (!txopts || !txopts->hopopt) goto done; hop = txopts->hopopt; opt = (unsigned char *)hop; opt_len = ipv6_optlen(hop); offset = sizeof(*hop); while (offset < opt_len) { len = calipso_tlv_len(hop, offset); if (len < 0) { ret_val = len; goto done; } switch (opt[offset]) { case IPV6_TLV_CALIPSO: if (len < CALIPSO_HDR_LEN) ret_val = -EINVAL; else ret_val = calipso_opt_getattr(&opt[offset], secattr); goto done; default: offset += len; break; } } done: txopt_put(txopts); return ret_val; } /** * calipso_sock_setattr - Add a CALIPSO option to a socket * @sk: the socket * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CALIPSO option on the given socket using the DOI definition and * security attributes passed to the function. This function requires * exclusive access to @sk, which means it either needs to be in the * process of being created or locked. Returns zero on success and negative * values on failure. * */ static int calipso_sock_setattr(struct sock *sk, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct ipv6_opt_hdr *old, *new; struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); old = NULL; if (txopts) old = txopts->hopopt; new = calipso_opt_insert(old, doi_def, secattr); txopt_put(txopts); if (IS_ERR(new)) return PTR_ERR(new); ret_val = calipso_opt_update(sk, new); kfree(new); return ret_val; } /** * calipso_sock_delattr - Delete the CALIPSO option from a socket * @sk: the socket * * Description: * Removes the CALIPSO option from a socket, if present. * */ static void calipso_sock_delattr(struct sock *sk) { struct ipv6_opt_hdr *new_hop; struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); if (!txopts || !txopts->hopopt) goto done; if (calipso_opt_del(txopts->hopopt, &new_hop)) goto done; calipso_opt_update(sk, new_hop); kfree(new_hop); done: txopt_put(txopts); } /* request sock functions. */ /** * calipso_req_setattr - Add a CALIPSO option to a connection request socket * @req: the connection request socket * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CALIPSO option on the given socket using the DOI definition and * security attributes passed to the function. Returns zero on success and * negative values on failure. * */ static int calipso_req_setattr(struct request_sock *req, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { struct ipv6_txoptions *txopts; struct inet_request_sock *req_inet = inet_rsk(req); struct ipv6_opt_hdr *old, *new; struct sock *sk = sk_to_full_sk(req_to_sk(req)); if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt) old = req_inet->ipv6_opt->hopopt; else old = NULL; new = calipso_opt_insert(old, doi_def, secattr); if (IS_ERR(new)) return PTR_ERR(new); txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); kfree(new); if (IS_ERR(txopts)) return PTR_ERR(txopts); txopts = xchg(&req_inet->ipv6_opt, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } return 0; } /** * calipso_req_delattr - Delete the CALIPSO option from a request socket * @req: the request socket * * Description: * Removes the CALIPSO option from a request socket, if present. * */ static void calipso_req_delattr(struct request_sock *req) { struct inet_request_sock *req_inet = inet_rsk(req); struct ipv6_opt_hdr *new; struct ipv6_txoptions *txopts; struct sock *sk = sk_to_full_sk(req_to_sk(req)); if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt) return; if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new)) return; /* Nothing to do */ txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); if (!IS_ERR(txopts)) { txopts = xchg(&req_inet->ipv6_opt, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } } kfree(new); } /* skbuff functions. */ /** * calipso_skbuff_optptr - Find the CALIPSO option in the packet * @skb: the packet * * Description: * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer * to the start of the CALIPSO option on success, NULL if one if not found. * */ static unsigned char *calipso_skbuff_optptr(const struct sk_buff *skb) { const struct ipv6hdr *ip6_hdr = ipv6_hdr(skb); int offset; if (ip6_hdr->nexthdr != NEXTHDR_HOP) return NULL; offset = ipv6_find_tlv(skb, sizeof(*ip6_hdr), IPV6_TLV_CALIPSO); if (offset >= 0) return (unsigned char *)ip6_hdr + offset; return NULL; } /** * calipso_skbuff_setattr - Set the CALIPSO option on a packet * @skb: the packet * @doi_def: the CALIPSO DOI to use * @secattr: the security attributes * * Description: * Set the CALIPSO option on the given packet based on the security attributes. * Returns a pointer to the IP header on success and NULL on failure. * */ static int calipso_skbuff_setattr(struct sk_buff *skb, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct ipv6hdr *ip6_hdr; struct ipv6_opt_hdr *hop; unsigned char buf[CALIPSO_MAX_BUFFER]; int len_delta, new_end, pad, payload; unsigned int start, end; ip6_hdr = ipv6_hdr(skb); if (ip6_hdr->nexthdr == NEXTHDR_HOP) { hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); ret_val = calipso_opt_find(hop, &start, &end); if (ret_val && ret_val != -ENOENT) return ret_val; } else { start = 0; end = 0; } memset(buf, 0, sizeof(buf)); ret_val = calipso_genopt(buf, start & 3, sizeof(buf), doi_def, secattr); if (ret_val < 0) return ret_val; new_end = start + ret_val; /* At this point new_end aligns to 4n, so (new_end & 4) pads to 8n */ pad = ((new_end & 4) + (end & 7)) & 7; len_delta = new_end - (int)end + pad; ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); if (ret_val < 0) return ret_val; ip6_hdr = ipv6_hdr(skb); /* Reset as skb_cow() may have moved it */ if (len_delta) { if (len_delta > 0) skb_push(skb, len_delta); else skb_pull(skb, -len_delta); memmove((char *)ip6_hdr - len_delta, ip6_hdr, sizeof(*ip6_hdr) + start); skb_reset_network_header(skb); ip6_hdr = ipv6_hdr(skb); payload = ntohs(ip6_hdr->payload_len); ip6_hdr->payload_len = htons(payload + len_delta); } hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); if (start == 0) { struct ipv6_opt_hdr *new_hop = (struct ipv6_opt_hdr *)buf; new_hop->nexthdr = ip6_hdr->nexthdr; new_hop->hdrlen = len_delta / 8 - 1; ip6_hdr->nexthdr = NEXTHDR_HOP; } else { hop->hdrlen += len_delta / 8; } memcpy((char *)hop + start, buf + (start & 3), new_end - start); calipso_pad_write((unsigned char *)hop, new_end, pad); return 0; } /** * calipso_skbuff_delattr - Delete any CALIPSO options from a packet * @skb: the packet * * Description: * Removes any and all CALIPSO options from the given packet. Returns zero on * success, negative values on failure. * */ static int calipso_skbuff_delattr(struct sk_buff *skb) { int ret_val; struct ipv6hdr *ip6_hdr; struct ipv6_opt_hdr *old_hop; u32 old_hop_len, start = 0, end = 0, delta, size, pad; if (!calipso_skbuff_optptr(skb)) return 0; /* since we are changing the packet we should make a copy */ ret_val = skb_cow(skb, skb_headroom(skb)); if (ret_val < 0) return ret_val; ip6_hdr = ipv6_hdr(skb); old_hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); old_hop_len = ipv6_optlen(old_hop); ret_val = calipso_opt_find(old_hop, &start, &end); if (ret_val) return ret_val; if (start == sizeof(*old_hop) && end == old_hop_len) { /* There's no other option in the header so we delete * the whole thing. */ delta = old_hop_len; size = sizeof(*ip6_hdr); ip6_hdr->nexthdr = old_hop->nexthdr; } else { delta = (end - start) & ~7; if (delta) old_hop->hdrlen -= delta / 8; pad = (end - start) & 7; size = sizeof(*ip6_hdr) + start + pad; calipso_pad_write((unsigned char *)old_hop, start, pad); } if (delta) { skb_pull(skb, delta); memmove((char *)ip6_hdr + delta, ip6_hdr, size); skb_reset_network_header(skb); } return 0; } static const struct netlbl_calipso_ops ops = { .doi_add = calipso_doi_add, .doi_free = calipso_doi_free, .doi_remove = calipso_doi_remove, .doi_getdef = calipso_doi_getdef, .doi_putdef = calipso_doi_putdef, .doi_walk = calipso_doi_walk, .sock_getattr = calipso_sock_getattr, .sock_setattr = calipso_sock_setattr, .sock_delattr = calipso_sock_delattr, .req_setattr = calipso_req_setattr, .req_delattr = calipso_req_delattr, .opt_getattr = calipso_opt_getattr, .skbuff_optptr = calipso_skbuff_optptr, .skbuff_setattr = calipso_skbuff_setattr, .skbuff_delattr = calipso_skbuff_delattr, .cache_invalidate = calipso_cache_invalidate, .cache_add = calipso_cache_add }; /** * calipso_init - Initialize the CALIPSO module * * Description: * Initialize the CALIPSO module and prepare it for use. Returns zero on * success and negative values on failure. * */ int __init calipso_init(void) { int ret_val; ret_val = calipso_cache_init(); if (!ret_val) netlbl_calipso_ops_register(&ops); return ret_val; } void calipso_exit(void) { netlbl_calipso_ops_register(NULL); calipso_cache_invalidate(); kfree(calipso_cache); } |
26 7 26 6 6 6 6 24 13 23 23 23 1 1 51 38 36 33 48 8 32 32 31 29 11 32 29 27 8 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ #include <linux/bitmap.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/err.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/btf_ids.h> #define BLOOM_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK) struct bpf_bloom_filter { struct bpf_map map; u32 bitset_mask; u32 hash_seed; u32 nr_hash_funcs; unsigned long bitset[]; }; static u32 hash(struct bpf_bloom_filter *bloom, void *value, u32 value_size, u32 index) { u32 h; if (likely(value_size % 4 == 0)) h = jhash2(value, value_size / 4, bloom->hash_seed + index); else h = jhash(value, value_size, bloom->hash_seed + index); return h & bloom->bitset_mask; } static long bloom_map_peek_elem(struct bpf_map *map, void *value) { struct bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); u32 i, h; for (i = 0; i < bloom->nr_hash_funcs; i++) { h = hash(bloom, value, map->value_size, i); if (!test_bit(h, bloom->bitset)) return -ENOENT; } return 0; } static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags) { struct bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); u32 i, h; if (flags != BPF_ANY) return -EINVAL; for (i = 0; i < bloom->nr_hash_funcs; i++) { h = hash(bloom, value, map->value_size, i); set_bit(h, bloom->bitset); } return 0; } static long bloom_map_pop_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } static long bloom_map_delete_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -EOPNOTSUPP; } static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) { u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits; int numa_node = bpf_map_attr_numa_node(attr); struct bpf_bloom_filter *bloom; if (attr->key_size != 0 || attr->value_size == 0 || attr->max_entries == 0 || attr->map_flags & ~BLOOM_CREATE_FLAG_MASK || !bpf_map_flags_access_ok(attr->map_flags) || /* The lower 4 bits of map_extra (0xF) specify the number * of hash functions */ (attr->map_extra & ~0xF)) return ERR_PTR(-EINVAL); nr_hash_funcs = attr->map_extra; if (nr_hash_funcs == 0) /* Default to using 5 hash functions if unspecified */ nr_hash_funcs = 5; /* For the bloom filter, the optimal bit array size that minimizes the * false positive probability is n * k / ln(2) where n is the number of * expected entries in the bloom filter and k is the number of hash * functions. We use 7 / 5 to approximate 1 / ln(2). * * We round this up to the nearest power of two to enable more efficient * hashing using bitmasks. The bitmask will be the bit array size - 1. * * If this overflows a u32, the bit array size will have 2^32 (4 * GB) bits. */ if (check_mul_overflow(attr->max_entries, nr_hash_funcs, &nr_bits) || check_mul_overflow(nr_bits / 5, (u32)7, &nr_bits) || nr_bits > (1UL << 31)) { /* The bit array size is 2^32 bits but to avoid overflowing the * u32, we use U32_MAX, which will round up to the equivalent * number of bytes */ bitset_bytes = BITS_TO_BYTES(U32_MAX); bitset_mask = U32_MAX; } else { if (nr_bits <= BITS_PER_LONG) nr_bits = BITS_PER_LONG; else nr_bits = roundup_pow_of_two(nr_bits); bitset_bytes = BITS_TO_BYTES(nr_bits); bitset_mask = nr_bits - 1; } bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long)); bloom = bpf_map_area_alloc(sizeof(*bloom) + bitset_bytes, numa_node); if (!bloom) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&bloom->map, attr); bloom->nr_hash_funcs = nr_hash_funcs; bloom->bitset_mask = bitset_mask; if (!(attr->map_flags & BPF_F_ZERO_SEED)) bloom->hash_seed = get_random_u32(); return &bloom->map; } static void bloom_map_free(struct bpf_map *map) { struct bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); bpf_map_area_free(bloom); } static void *bloom_map_lookup_elem(struct bpf_map *map, void *key) { /* The eBPF program should use map_peek_elem instead */ return ERR_PTR(-EINVAL); } static long bloom_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { /* The eBPF program should use map_push_elem instead */ return -EINVAL; } static int bloom_map_check_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { /* Bloom filter maps are keyless */ return btf_type_is_void(key_type) ? 0 : -EINVAL; } static u64 bloom_map_mem_usage(const struct bpf_map *map) { struct bpf_bloom_filter *bloom; u64 bitset_bytes; bloom = container_of(map, struct bpf_bloom_filter, map); bitset_bytes = BITS_TO_BYTES((u64)bloom->bitset_mask + 1); bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long)); return sizeof(*bloom) + bitset_bytes; } BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter) const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = bloom_map_alloc, .map_free = bloom_map_free, .map_get_next_key = bloom_map_get_next_key, .map_push_elem = bloom_map_push_elem, .map_peek_elem = bloom_map_peek_elem, .map_pop_elem = bloom_map_pop_elem, .map_lookup_elem = bloom_map_lookup_elem, .map_update_elem = bloom_map_update_elem, .map_delete_elem = bloom_map_delete_elem, .map_check_btf = bloom_map_check_btf, .map_mem_usage = bloom_map_mem_usage, .map_btf_id = &bpf_bloom_map_btf_ids[0], }; |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "peerlookup.h" #include "peer.h" #include "noise.h" static struct hlist_head *pubkey_bucket(struct pubkey_hashtable *table, const u8 pubkey[NOISE_PUBLIC_KEY_LEN]) { /* siphash gives us a secure 64bit number based on a random key. Since * the bits are uniformly distributed, we can then mask off to get the * bits we need. */ const u64 hash = siphash(pubkey, NOISE_PUBLIC_KEY_LEN, &table->key); return &table->hashtable[hash & (HASH_SIZE(table->hashtable) - 1)]; } struct pubkey_hashtable *wg_pubkey_hashtable_alloc(void) { struct pubkey_hashtable *table = kvmalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; get_random_bytes(&table->key, sizeof(table->key)); hash_init(table->hashtable); mutex_init(&table->lock); return table; } void wg_pubkey_hashtable_add(struct pubkey_hashtable *table, struct wg_peer *peer) { mutex_lock(&table->lock); hlist_add_head_rcu(&peer->pubkey_hash, pubkey_bucket(table, peer->handshake.remote_static)); mutex_unlock(&table->lock); } void wg_pubkey_hashtable_remove(struct pubkey_hashtable *table, struct wg_peer *peer) { mutex_lock(&table->lock); hlist_del_init_rcu(&peer->pubkey_hash); mutex_unlock(&table->lock); } /* Returns a strong reference to a peer */ struct wg_peer * wg_pubkey_hashtable_lookup(struct pubkey_hashtable *table, const u8 pubkey[NOISE_PUBLIC_KEY_LEN]) { struct wg_peer *iter_peer, *peer = NULL; rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(iter_peer, pubkey_bucket(table, pubkey), pubkey_hash) { if (!memcmp(pubkey, iter_peer->handshake.remote_static, NOISE_PUBLIC_KEY_LEN)) { peer = iter_peer; break; } } peer = wg_peer_get_maybe_zero(peer); rcu_read_unlock_bh(); return peer; } static struct hlist_head *index_bucket(struct index_hashtable *table, const __le32 index) { /* Since the indices are random and thus all bits are uniformly * distributed, we can find its bucket simply by masking. */ return &table->hashtable[(__force u32)index & (HASH_SIZE(table->hashtable) - 1)]; } struct index_hashtable *wg_index_hashtable_alloc(void) { struct index_hashtable *table = kvmalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; hash_init(table->hashtable); spin_lock_init(&table->lock); return table; } /* At the moment, we limit ourselves to 2^20 total peers, which generally might * amount to 2^20*3 items in this hashtable. The algorithm below works by * picking a random number and testing it. We can see that these limits mean we * usually succeed pretty quickly: * * >>> def calculation(tries, size): * ... return (size / 2**32)**(tries - 1) * (1 - (size / 2**32)) * ... * >>> calculation(1, 2**20 * 3) * 0.999267578125 * >>> calculation(2, 2**20 * 3) * 0.0007318854331970215 * >>> calculation(3, 2**20 * 3) * 5.360489012673497e-07 * >>> calculation(4, 2**20 * 3) * 3.9261394135792216e-10 * * At the moment, we don't do any masking, so this algorithm isn't exactly * constant time in either the random guessing or in the hash list lookup. We * could require a minimum of 3 tries, which would successfully mask the * guessing. this would not, however, help with the growing hash lengths, which * is another thing to consider moving forward. */ __le32 wg_index_hashtable_insert(struct index_hashtable *table, struct index_hashtable_entry *entry) { struct index_hashtable_entry *existing_entry; spin_lock_bh(&table->lock); hlist_del_init_rcu(&entry->index_hash); spin_unlock_bh(&table->lock); rcu_read_lock_bh(); search_unused_slot: /* First we try to find an unused slot, randomly, while unlocked. */ entry->index = (__force __le32)get_random_u32(); hlist_for_each_entry_rcu_bh(existing_entry, index_bucket(table, entry->index), index_hash) { if (existing_entry->index == entry->index) /* If it's already in use, we continue searching. */ goto search_unused_slot; } /* Once we've found an unused slot, we lock it, and then double-check * that nobody else stole it from us. */ spin_lock_bh(&table->lock); hlist_for_each_entry_rcu_bh(existing_entry, index_bucket(table, entry->index), index_hash) { if (existing_entry->index == entry->index) { spin_unlock_bh(&table->lock); /* If it was stolen, we start over. */ goto search_unused_slot; } } /* Otherwise, we know we have it exclusively (since we're locked), * so we insert. */ hlist_add_head_rcu(&entry->index_hash, index_bucket(table, entry->index)); spin_unlock_bh(&table->lock); rcu_read_unlock_bh(); return entry->index; } bool wg_index_hashtable_replace(struct index_hashtable *table, struct index_hashtable_entry *old, struct index_hashtable_entry *new) { bool ret; spin_lock_bh(&table->lock); ret = !hlist_unhashed(&old->index_hash); if (unlikely(!ret)) goto out; new->index = old->index; hlist_replace_rcu(&old->index_hash, &new->index_hash); /* Calling init here NULLs out index_hash, and in fact after this * function returns, it's theoretically possible for this to get * reinserted elsewhere. That means the RCU lookup below might either * terminate early or jump between buckets, in which case the packet * simply gets dropped, which isn't terrible. */ INIT_HLIST_NODE(&old->index_hash); out: spin_unlock_bh(&table->lock); return ret; } void wg_index_hashtable_remove(struct index_hashtable *table, struct index_hashtable_entry *entry) { spin_lock_bh(&table->lock); hlist_del_init_rcu(&entry->index_hash); spin_unlock_bh(&table->lock); } /* Returns a strong reference to a entry->peer */ struct index_hashtable_entry * wg_index_hashtable_lookup(struct index_hashtable *table, const enum index_hashtable_type type_mask, const __le32 index, struct wg_peer **peer) { struct index_hashtable_entry *iter_entry, *entry = NULL; rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(iter_entry, index_bucket(table, index), index_hash) { if (iter_entry->index == index) { if (likely(iter_entry->type & type_mask)) entry = iter_entry; break; } } if (likely(entry)) { entry->peer = wg_peer_get_maybe_zero(entry->peer); if (likely(entry->peer)) *peer = entry->peer; else entry = NULL; } rcu_read_unlock_bh(); return entry; } |
409 3390 3156 1606 1606 3389 1950 3390 4 3390 3390 3390 3390 3390 3310 3310 3309 3310 3310 3129 3129 3129 3361 3361 3360 3309 3129 3127 658 658 658 658 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 Facebook * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/sched.h> #include <linux/random.h> #include <linux/sbitmap.h> #include <linux/seq_file.h> static int init_alloc_hint(struct sbitmap *sb, gfp_t flags) { unsigned depth = sb->depth; sb->alloc_hint = alloc_percpu_gfp(unsigned int, flags); if (!sb->alloc_hint) return -ENOMEM; if (depth && !sb->round_robin) { int i; for_each_possible_cpu(i) *per_cpu_ptr(sb->alloc_hint, i) = get_random_u32_below(depth); } return 0; } static inline unsigned update_alloc_hint_before_get(struct sbitmap *sb, unsigned int depth) { unsigned hint; hint = this_cpu_read(*sb->alloc_hint); if (unlikely(hint >= depth)) { hint = depth ? get_random_u32_below(depth) : 0; this_cpu_write(*sb->alloc_hint, hint); } return hint; } static inline void update_alloc_hint_after_get(struct sbitmap *sb, unsigned int depth, unsigned int hint, unsigned int nr) { if (nr == -1) { /* If the map is full, a hint won't do us much good. */ this_cpu_write(*sb->alloc_hint, 0); } else if (nr == hint || unlikely(sb->round_robin)) { /* Only update the hint if we used it. */ hint = nr + 1; if (hint >= depth - 1) hint = 0; this_cpu_write(*sb->alloc_hint, hint); } } /* * See if we have deferred clears that we can batch move */ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map) { unsigned long mask; if (!READ_ONCE(map->cleared)) return false; /* * First get a stable cleared mask, setting the old mask to 0. */ mask = xchg(&map->cleared, 0); /* * Now clear the masked bits in our free word */ atomic_long_andnot(mask, (atomic_long_t *)&map->word); BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(map->word)); return true; } int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node, bool round_robin, bool alloc_hint) { unsigned int bits_per_word; if (shift < 0) shift = sbitmap_calculate_shift(depth); bits_per_word = 1U << shift; if (bits_per_word > BITS_PER_LONG) return -EINVAL; sb->shift = shift; sb->depth = depth; sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); sb->round_robin = round_robin; if (depth == 0) { sb->map = NULL; return 0; } if (alloc_hint) { if (init_alloc_hint(sb, flags)) return -ENOMEM; } else { sb->alloc_hint = NULL; } sb->map = kvzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node); if (!sb->map) { free_percpu(sb->alloc_hint); return -ENOMEM; } return 0; } EXPORT_SYMBOL_GPL(sbitmap_init_node); void sbitmap_resize(struct sbitmap *sb, unsigned int depth) { unsigned int bits_per_word = 1U << sb->shift; unsigned int i; for (i = 0; i < sb->map_nr; i++) sbitmap_deferred_clear(&sb->map[i]); sb->depth = depth; sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); } EXPORT_SYMBOL_GPL(sbitmap_resize); static int __sbitmap_get_word(unsigned long *word, unsigned long depth, unsigned int hint, bool wrap) { int nr; /* don't wrap if starting from 0 */ wrap = wrap && hint; while (1) { nr = find_next_zero_bit(word, depth, hint); if (unlikely(nr >= depth)) { /* * We started with an offset, and we didn't reset the * offset to 0 in a failure case, so start from 0 to * exhaust the map. */ if (hint && wrap) { hint = 0; continue; } return -1; } if (!test_and_set_bit_lock(nr, word)) break; hint = nr + 1; if (hint >= depth - 1) hint = 0; } return nr; } static int sbitmap_find_bit_in_word(struct sbitmap_word *map, unsigned int depth, unsigned int alloc_hint, bool wrap) { int nr; do { nr = __sbitmap_get_word(&map->word, depth, alloc_hint, wrap); if (nr != -1) break; if (!sbitmap_deferred_clear(map)) break; } while (1); return nr; } static int sbitmap_find_bit(struct sbitmap *sb, unsigned int depth, unsigned int index, unsigned int alloc_hint, bool wrap) { unsigned int i; int nr = -1; for (i = 0; i < sb->map_nr; i++) { nr = sbitmap_find_bit_in_word(&sb->map[index], min_t(unsigned int, __map_depth(sb, index), depth), alloc_hint, wrap); if (nr != -1) { nr += index << sb->shift; break; } /* Jump to next index. */ alloc_hint = 0; if (++index >= sb->map_nr) index = 0; } return nr; } static int __sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint) { unsigned int index; index = SB_NR_TO_INDEX(sb, alloc_hint); /* * Unless we're doing round robin tag allocation, just use the * alloc_hint to find the right word index. No point in looping * twice in find_next_zero_bit() for that case. */ if (sb->round_robin) alloc_hint = SB_NR_TO_BIT(sb, alloc_hint); else alloc_hint = 0; return sbitmap_find_bit(sb, UINT_MAX, index, alloc_hint, !sb->round_robin); } int sbitmap_get(struct sbitmap *sb) { int nr; unsigned int hint, depth; if (WARN_ON_ONCE(unlikely(!sb->alloc_hint))) return -1; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); nr = __sbitmap_get(sb, hint); update_alloc_hint_after_get(sb, depth, hint, nr); return nr; } EXPORT_SYMBOL_GPL(sbitmap_get); static int __sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, unsigned long shallow_depth) { unsigned int index; index = SB_NR_TO_INDEX(sb, alloc_hint); alloc_hint = SB_NR_TO_BIT(sb, alloc_hint); return sbitmap_find_bit(sb, shallow_depth, index, alloc_hint, true); } int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth) { int nr; unsigned int hint, depth; if (WARN_ON_ONCE(unlikely(!sb->alloc_hint))) return -1; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); nr = __sbitmap_get_shallow(sb, hint, shallow_depth); update_alloc_hint_after_get(sb, depth, hint, nr); return nr; } EXPORT_SYMBOL_GPL(sbitmap_get_shallow); bool sbitmap_any_bit_set(const struct sbitmap *sb) { unsigned int i; for (i = 0; i < sb->map_nr; i++) { if (sb->map[i].word & ~sb->map[i].cleared) return true; } return false; } EXPORT_SYMBOL_GPL(sbitmap_any_bit_set); static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set) { unsigned int i, weight = 0; for (i = 0; i < sb->map_nr; i++) { const struct sbitmap_word *word = &sb->map[i]; unsigned int word_depth = __map_depth(sb, i); if (set) weight += bitmap_weight(&word->word, word_depth); else weight += bitmap_weight(&word->cleared, word_depth); } return weight; } static unsigned int sbitmap_cleared(const struct sbitmap *sb) { return __sbitmap_weight(sb, false); } unsigned int sbitmap_weight(const struct sbitmap *sb) { return __sbitmap_weight(sb, true) - sbitmap_cleared(sb); } EXPORT_SYMBOL_GPL(sbitmap_weight); void sbitmap_show(struct sbitmap *sb, struct seq_file *m) { seq_printf(m, "depth=%u\n", sb->depth); seq_printf(m, "busy=%u\n", sbitmap_weight(sb)); seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb)); seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift); seq_printf(m, "map_nr=%u\n", sb->map_nr); } EXPORT_SYMBOL_GPL(sbitmap_show); static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte) { if ((offset & 0xf) == 0) { if (offset != 0) seq_putc(m, '\n'); seq_printf(m, "%08x:", offset); } if ((offset & 0x1) == 0) seq_putc(m, ' '); seq_printf(m, "%02x", byte); } void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m) { u8 byte = 0; unsigned int byte_bits = 0; unsigned int offset = 0; int i; for (i = 0; i < sb->map_nr; i++) { unsigned long word = READ_ONCE(sb->map[i].word); unsigned long cleared = READ_ONCE(sb->map[i].cleared); unsigned int word_bits = __map_depth(sb, i); word &= ~cleared; while (word_bits > 0) { unsigned int bits = min(8 - byte_bits, word_bits); byte |= (word & (BIT(bits) - 1)) << byte_bits; byte_bits += bits; if (byte_bits == 8) { emit_byte(m, offset, byte); byte = 0; byte_bits = 0; offset++; } word >>= bits; word_bits -= bits; } } if (byte_bits) { emit_byte(m, offset, byte); offset++; } if (offset) seq_putc(m, '\n'); } EXPORT_SYMBOL_GPL(sbitmap_bitmap_show); static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq, unsigned int depth) { unsigned int wake_batch; unsigned int shallow_depth; /* * For each batch, we wake up one queue. We need to make sure that our * batch size is small enough that the full depth of the bitmap, * potentially limited by a shallow depth, is enough to wake up all of * the queues. * * Each full word of the bitmap has bits_per_word bits, and there might * be a partial word. There are depth / bits_per_word full words and * depth % bits_per_word bits left over. In bitwise arithmetic: * * bits_per_word = 1 << shift * depth / bits_per_word = depth >> shift * depth % bits_per_word = depth & ((1 << shift) - 1) * * Each word can be limited to sbq->min_shallow_depth bits. */ shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth); depth = ((depth >> sbq->sb.shift) * shallow_depth + min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth)); wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH); return wake_batch; } int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, int shift, bool round_robin, gfp_t flags, int node) { int ret; int i; ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node, round_robin, true); if (ret) return ret; sbq->min_shallow_depth = UINT_MAX; sbq->wake_batch = sbq_calc_wake_batch(sbq, depth); atomic_set(&sbq->wake_index, 0); atomic_set(&sbq->ws_active, 0); atomic_set(&sbq->completion_cnt, 0); atomic_set(&sbq->wakeup_cnt, 0); sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { sbitmap_free(&sbq->sb); return -ENOMEM; } for (i = 0; i < SBQ_WAIT_QUEUES; i++) init_waitqueue_head(&sbq->ws[i].wait); return 0; } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, unsigned int depth) { unsigned int wake_batch; wake_batch = sbq_calc_wake_batch(sbq, depth); if (sbq->wake_batch != wake_batch) WRITE_ONCE(sbq->wake_batch, wake_batch); } void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, unsigned int users) { unsigned int wake_batch; unsigned int depth = (sbq->sb.depth + users - 1) / users; wake_batch = clamp_val(depth / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH); WRITE_ONCE(sbq->wake_batch, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch); void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) { sbitmap_queue_update_wake_batch(sbq, depth); sbitmap_resize(&sbq->sb, depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_resize); int __sbitmap_queue_get(struct sbitmap_queue *sbq) { return sbitmap_get(&sbq->sb); } EXPORT_SYMBOL_GPL(__sbitmap_queue_get); unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, unsigned int *offset) { struct sbitmap *sb = &sbq->sb; unsigned int hint, depth; unsigned long index, nr; int i; if (unlikely(sb->round_robin)) return 0; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); index = SB_NR_TO_INDEX(sb, hint); for (i = 0; i < sb->map_nr; i++) { struct sbitmap_word *map = &sb->map[index]; unsigned long get_mask; unsigned int map_depth = __map_depth(sb, index); sbitmap_deferred_clear(map); if (map->word == (1UL << (map_depth - 1)) - 1) goto next; nr = find_first_zero_bit(&map->word, map_depth); if (nr + nr_tags <= map_depth) { atomic_long_t *ptr = (atomic_long_t *) &map->word; unsigned long val; get_mask = ((1UL << nr_tags) - 1) << nr; val = READ_ONCE(map->word); while (!atomic_long_try_cmpxchg(ptr, &val, get_mask | val)) ; get_mask = (get_mask & ~val) >> nr; if (get_mask) { *offset = nr + (index << sb->shift); update_alloc_hint_after_get(sb, depth, hint, *offset + nr_tags - 1); return get_mask; } } next: /* Jump to next index. */ if (++index >= sb->map_nr) index = 0; } return 0; } int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int shallow_depth) { WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth); return sbitmap_get_shallow(&sbq->sb, shallow_depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_get_shallow); void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, unsigned int min_shallow_depth) { sbq->min_shallow_depth = min_shallow_depth; sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth); static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { int i, wake_index, woken; if (!atomic_read(&sbq->ws_active)) return; wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; /* * Advance the index before checking the current queue. * It improves fairness, by ensuring the queue doesn't * need to be fully emptied before trying to wake up * from the next one. */ wake_index = sbq_index_inc(wake_index); if (waitqueue_active(&ws->wait)) { woken = wake_up_nr(&ws->wait, nr); if (woken == nr) break; nr -= woken; } } if (wake_index != atomic_read(&sbq->wake_index)) atomic_set(&sbq->wake_index, wake_index); } void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { unsigned int wake_batch = READ_ONCE(sbq->wake_batch); unsigned int wakeups; if (!atomic_read(&sbq->ws_active)) return; atomic_add(nr, &sbq->completion_cnt); wakeups = atomic_read(&sbq->wakeup_cnt); do { if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch) return; } while (!atomic_try_cmpxchg(&sbq->wakeup_cnt, &wakeups, wakeups + wake_batch)); __sbitmap_queue_wake_up(sbq, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag) { if (likely(!sb->round_robin && tag < sb->depth)) data_race(*per_cpu_ptr(sb->alloc_hint, cpu) = tag); } void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset, int *tags, int nr_tags) { struct sbitmap *sb = &sbq->sb; unsigned long *addr = NULL; unsigned long mask = 0; int i; smp_mb__before_atomic(); for (i = 0; i < nr_tags; i++) { const int tag = tags[i] - offset; unsigned long *this_addr; /* since we're clearing a batch, skip the deferred map */ this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word; if (!addr) { addr = this_addr; } else if (addr != this_addr) { atomic_long_andnot(mask, (atomic_long_t *) addr); mask = 0; addr = this_addr; } mask |= (1UL << SB_NR_TO_BIT(sb, tag)); } if (mask) atomic_long_andnot(mask, (atomic_long_t *) addr); smp_mb__after_atomic(); sbitmap_queue_wake_up(sbq, nr_tags); sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(), tags[nr_tags - 1] - offset); } void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu) { /* * Once the clear bit is set, the bit may be allocated out. * * Orders READ/WRITE on the associated instance(such as request * of blk_mq) by this bit for avoiding race with re-allocation, * and its pair is the memory barrier implied in __sbitmap_get_word. * * One invariant is that the clear bit has to be zero when the bit * is in use. */ smp_mb__before_atomic(); sbitmap_deferred_clear_bit(&sbq->sb, nr); /* * Pairs with the memory barrier in set_current_state() to ensure the * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the * waiter. See the comment on waitqueue_active(). */ smp_mb__after_atomic(); sbitmap_queue_wake_up(sbq, 1); sbitmap_update_cpu_hint(&sbq->sb, cpu, nr); } EXPORT_SYMBOL_GPL(sbitmap_queue_clear); void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) { int i, wake_index; /* * Pairs with the memory barrier in set_current_state() like in * sbitmap_queue_wake_up(). */ smp_mb(); wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; if (waitqueue_active(&ws->wait)) wake_up(&ws->wait); wake_index = sbq_index_inc(wake_index); } } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) { bool first; int i; sbitmap_show(&sbq->sb, m); seq_puts(m, "alloc_hint={"); first = true; for_each_possible_cpu(i) { if (!first) seq_puts(m, ", "); first = false; seq_printf(m, "%u", *per_cpu_ptr(sbq->sb.alloc_hint, i)); } seq_puts(m, "}\n"); seq_printf(m, "wake_batch=%u\n", sbq->wake_batch); seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index)); seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active)); seq_puts(m, "ws={\n"); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[i]; seq_printf(m, "\t{.wait=%s},\n", waitqueue_active(&ws->wait) ? "active" : "inactive"); } seq_puts(m, "}\n"); seq_printf(m, "round_robin=%d\n", sbq->sb.round_robin); seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_show); void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait) { if (!sbq_wait->sbq) { sbq_wait->sbq = sbq; atomic_inc(&sbq->ws_active); add_wait_queue(&ws->wait, &sbq_wait->wait); } } EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue); void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait) { list_del_init(&sbq_wait->wait.entry); if (sbq_wait->sbq) { atomic_dec(&sbq_wait->sbq->ws_active); sbq_wait->sbq = NULL; } } EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue); void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait, int state) { if (!sbq_wait->sbq) { atomic_inc(&sbq->ws_active); sbq_wait->sbq = sbq; } prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state); } EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait); void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait) { finish_wait(&ws->wait, &sbq_wait->wait); if (sbq_wait->sbq) { atomic_dec(&sbq->ws_active); sbq_wait->sbq = NULL; } } EXPORT_SYMBOL_GPL(sbitmap_finish_wait); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 | // SPDX-License-Identifier: GPL-2.0-only /* * This file provides /sys/class/ieee80211/<wiphy name>/ * and some default attributes. * * Copyright 2005-2006 Jiri Benc <jbenc@suse.cz> * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2020-2021, 2023 Intel Corporation */ #include <linux/device.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/nl80211.h> #include <linux/rtnetlink.h> #include <net/cfg80211.h> #include "sysfs.h" #include "core.h" #include "rdev-ops.h" static inline struct cfg80211_registered_device *dev_to_rdev( struct device *dev) { return container_of(dev, struct cfg80211_registered_device, wiphy.dev); } #define SHOW_FMT(name, fmt, member) \ static ssize_t name ## _show(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \ } \ static DEVICE_ATTR_RO(name) SHOW_FMT(index, "%d", wiphy_idx); SHOW_FMT(macaddress, "%pM", wiphy.perm_addr); SHOW_FMT(address_mask, "%pM", wiphy.addr_mask); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; return sprintf(buf, "%s\n", wiphy_name(wiphy)); } static DEVICE_ATTR_RO(name); static ssize_t addresses_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; char *start = buf; int i; if (!wiphy->addresses) return sprintf(buf, "%pM\n", wiphy->perm_addr); for (i = 0; i < wiphy->n_addresses; i++) buf += sprintf(buf, "%pM\n", wiphy->addresses[i].addr); return buf - start; } static DEVICE_ATTR_RO(addresses); static struct attribute *ieee80211_attrs[] = { &dev_attr_index.attr, &dev_attr_macaddress.attr, &dev_attr_address_mask.attr, &dev_attr_addresses.attr, &dev_attr_name.attr, NULL, }; ATTRIBUTE_GROUPS(ieee80211); static void wiphy_dev_release(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); cfg80211_dev_free(rdev); } #ifdef CONFIG_PM_SLEEP static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_leave(rdev, wdev); } static int wiphy_suspend(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; rdev->suspend_at = ktime_get_boottime_seconds(); rtnl_lock(); wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered) { if (!rdev->wiphy.wowlan_config) { cfg80211_leave_all(rdev); cfg80211_process_rdev_events(rdev); } cfg80211_process_wiphy_works(rdev); if (rdev->ops->suspend) ret = rdev_suspend(rdev, rdev->wiphy.wowlan_config); if (ret == 1) { /* Driver refuse to configure wowlan */ cfg80211_leave_all(rdev); cfg80211_process_rdev_events(rdev); cfg80211_process_wiphy_works(rdev); ret = rdev_suspend(rdev, NULL); } if (ret == 0) rdev->suspended = true; } wiphy_unlock(&rdev->wiphy); rtnl_unlock(); return ret; } static int wiphy_resume(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; /* Age scan results with time spent in suspend */ cfg80211_bss_age(rdev, ktime_get_boottime_seconds() - rdev->suspend_at); rtnl_lock(); wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered && rdev->ops->resume) ret = rdev_resume(rdev); rdev->suspended = false; schedule_work(&rdev->wiphy_work); wiphy_unlock(&rdev->wiphy); if (ret) cfg80211_shutdown_all_interfaces(&rdev->wiphy); rtnl_unlock(); return ret; } static SIMPLE_DEV_PM_OPS(wiphy_pm_ops, wiphy_suspend, wiphy_resume); #define WIPHY_PM_OPS (&wiphy_pm_ops) #else #define WIPHY_PM_OPS NULL #endif static const void *wiphy_namespace(const struct device *d) { struct wiphy *wiphy = container_of(d, struct wiphy, dev); return wiphy_net(wiphy); } struct class ieee80211_class = { .name = "ieee80211", .dev_release = wiphy_dev_release, .dev_groups = ieee80211_groups, .pm = WIPHY_PM_OPS, .ns_type = &net_ns_type_operations, .namespace = wiphy_namespace, }; int wiphy_sysfs_init(void) { return class_register(&ieee80211_class); } void wiphy_sysfs_exit(void) { class_unregister(&ieee80211_class); } |
197 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NF_TABLES_H #define _NET_NF_TABLES_H #include <asm/unaligned.h> #include <linux/list.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/nf_tables.h> #include <linux/u64_stats_sync.h> #include <linux/rhashtable.h> #include <net/netfilter/nf_flow_table.h> #include <net/netlink.h> #include <net/flow_offload.h> #include <net/netns/generic.h> #define NFT_MAX_HOOKS (NF_INET_INGRESS + 1) struct module; #define NFT_JUMP_STACK_SIZE 16 enum { NFT_PKTINFO_L4PROTO = (1 << 0), NFT_PKTINFO_INNER = (1 << 1), NFT_PKTINFO_INNER_FULL = (1 << 2), }; struct nft_pktinfo { struct sk_buff *skb; const struct nf_hook_state *state; u8 flags; u8 tprot; u16 fragoff; u16 thoff; u16 inneroff; }; static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) { return pkt->state->sk; } static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt) { return pkt->thoff; } static inline struct net *nft_net(const struct nft_pktinfo *pkt) { return pkt->state->net; } static inline unsigned int nft_hook(const struct nft_pktinfo *pkt) { return pkt->state->hook; } static inline u8 nft_pf(const struct nft_pktinfo *pkt) { return pkt->state->pf; } static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt) { return pkt->state->in; } static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt) { return pkt->state->out; } static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, struct sk_buff *skb, const struct nf_hook_state *state) { pkt->skb = skb; pkt->state = state; } static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt) { pkt->flags = 0; pkt->tprot = 0; pkt->thoff = 0; pkt->fragoff = 0; } /** * struct nft_verdict - nf_tables verdict * * @code: nf_tables/netfilter verdict code * @chain: destination chain for NFT_JUMP/NFT_GOTO */ struct nft_verdict { u32 code; struct nft_chain *chain; }; struct nft_data { union { u32 data[4]; struct nft_verdict verdict; }; } __attribute__((aligned(__alignof__(u64)))); #define NFT_REG32_NUM 20 /** * struct nft_regs - nf_tables register set * * @data: data registers * @verdict: verdict register * * The first four data registers alias to the verdict register. */ struct nft_regs { union { u32 data[NFT_REG32_NUM]; struct nft_verdict verdict; }; }; struct nft_regs_track { struct { const struct nft_expr *selector; const struct nft_expr *bitwise; u8 num_reg; } regs[NFT_REG32_NUM]; const struct nft_expr *cur; const struct nft_expr *last; }; /* Store/load an u8, u16 or u64 integer to/from the u32 data register. * * Note, when using concatenations, register allocation happens at 32-bit * level. So for store instruction, pad the rest part with zero to avoid * garbage values. */ static inline void nft_reg_store8(u32 *dreg, u8 val) { *dreg = 0; *(u8 *)dreg = val; } static inline u8 nft_reg_load8(const u32 *sreg) { return *(u8 *)sreg; } static inline void nft_reg_store16(u32 *dreg, u16 val) { *dreg = 0; *(u16 *)dreg = val; } static inline void nft_reg_store_be16(u32 *dreg, __be16 val) { nft_reg_store16(dreg, (__force __u16)val); } static inline u16 nft_reg_load16(const u32 *sreg) { return *(u16 *)sreg; } static inline __be16 nft_reg_load_be16(const u32 *sreg) { return (__force __be16)nft_reg_load16(sreg); } static inline __be32 nft_reg_load_be32(const u32 *sreg) { return *(__force __be32 *)sreg; } static inline void nft_reg_store64(u32 *dreg, u64 val) { put_unaligned(val, (u64 *)dreg); } static inline u64 nft_reg_load64(const u32 *sreg) { return get_unaligned((u64 *)sreg); } static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { if (len % NFT_REG32_SIZE) dst[len / NFT_REG32_SIZE] = 0; memcpy(dst, src, len); } /** * struct nft_ctx - nf_tables rule/set context * * @net: net namespace * @table: the table the chain is contained in * @chain: the chain the rule is contained in * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number * @family: protocol family * @level: depth of the chains * @report: notify via unicast netlink message */ struct nft_ctx { struct net *net; struct nft_table *table; struct nft_chain *chain; const struct nlattr * const *nla; u32 portid; u32 seq; u16 flags; u8 family; u8 level; bool report; }; enum nft_data_desc_flags { NFT_DATA_DESC_SETELEM = (1 << 0), }; struct nft_data_desc { enum nft_data_types type; unsigned int size; unsigned int len; unsigned int flags; }; int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla); void nft_data_hold(const struct nft_data *data, enum nft_data_types type); void nft_data_release(const struct nft_data *data, enum nft_data_types type); int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, enum nft_data_types type, unsigned int len); static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg) { return reg == NFT_REG_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; } static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) { return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE; } int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len); int nft_parse_register_store(const struct nft_ctx *ctx, const struct nlattr *attr, u8 *dreg, const struct nft_data *data, enum nft_data_types type, unsigned int len); /** * struct nft_userdata - user defined data associated with an object * * @len: length of the data * @data: content * * The presence of user data is indicated in an object specific fashion, * so a length of zero can't occur and the value "len" indicates data * of length len + 1. */ struct nft_userdata { u8 len; unsigned char data[]; }; /** * struct nft_set_elem - generic representation of set elements * * @key: element key * @key_end: closing element key * @priv: element private data and extensions */ struct nft_set_elem { union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key_end; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } data; void *priv; }; struct nft_set; struct nft_set_iter { u8 genmask; unsigned int count; unsigned int skip; int err; int (*fn)(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_set_elem *elem); }; /** * struct nft_set_desc - description of set elements * * @ktype: key type * @klen: key length * @dtype: data type * @dlen: data length * @objtype: object type * @flags: flags * @size: number of set elements * @policy: set policy * @gc_int: garbage collector interval * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @expr: set must support for expressions */ struct nft_set_desc { u32 ktype; unsigned int klen; u32 dtype; unsigned int dlen; u32 objtype; unsigned int size; u32 policy; u32 gc_int; u64 timeout; u8 field_len[NFT_REG32_COUNT]; u8 field_count; bool expr; }; /** * enum nft_set_class - performance class * * @NFT_LOOKUP_O_1: constant, O(1) * @NFT_LOOKUP_O_LOG_N: logarithmic, O(log N) * @NFT_LOOKUP_O_N: linear, O(N) */ enum nft_set_class { NFT_SET_CLASS_O_1, NFT_SET_CLASS_O_LOG_N, NFT_SET_CLASS_O_N, }; /** * struct nft_set_estimate - estimation of memory and performance * characteristics * * @size: required memory * @lookup: lookup performance class * @space: memory class */ struct nft_set_estimate { u64 size; enum nft_set_class lookup; enum nft_set_class space; }; #define NFT_EXPR_MAXATTR 16 #define NFT_EXPR_SIZE(size) (sizeof(struct nft_expr) + \ ALIGN(size, __alignof__(struct nft_expr))) /** * struct nft_expr - nf_tables expression * * @ops: expression ops * @data: expression private data */ struct nft_expr { const struct nft_expr_ops *ops; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_expr_priv(const struct nft_expr *expr) { return (void *)expr->data; } struct nft_expr_info; int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, struct nft_expr_info *info); int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr, bool reset); bool nft_expr_reduce_bitwise(struct nft_regs_track *track, const struct nft_expr *expr); struct nft_set_ext; /** * struct nft_set_ops - nf_tables set operations * * @lookup: look up an element within the set * @update: update an element if exists, add it if doesn't exist * @delete: delete an element * @insert: insert new element into set * @activate: activate new element in the next generation * @deactivate: lookup for element and deactivate it in the next generation * @flush: deactivate element in the next generation * @remove: remove element from set * @walk: iterate over all set elements * @get: get set elements * @privsize: function to return size of set private data * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster * and currently only used in the packet path. All the rest are slower, * control plane functions. */ struct nft_set_ops { bool (*lookup)(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext); bool (*update)(struct nft_set *set, const u32 *key, void *(*new)(struct nft_set *, const struct nft_expr *, struct nft_regs *), const struct nft_expr *expr, struct nft_regs *regs, const struct nft_set_ext **ext); bool (*delete)(const struct nft_set *set, const u32 *key); int (*insert)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_set_ext **ext); void (*activate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); void * (*deactivate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); bool (*flush)(const struct net *net, const struct nft_set *set, void *priv); void (*remove)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter); void * (*get)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); void (*commit)(const struct nft_set *set); void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); int (*init)(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]); void (*destroy)(const struct nft_ctx *ctx, const struct nft_set *set); void (*gc_init)(const struct nft_set *set); unsigned int elemsize; }; /** * struct nft_set_type - nf_tables set type * * @ops: set ops for this type * @features: features supported by the implementation */ struct nft_set_type { const struct nft_set_ops ops; u32 features; }; #define to_set_type(o) container_of(o, struct nft_set_type, ops) struct nft_set_elem_expr { u8 size; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; #define nft_setelem_expr_at(__elem_expr, __offset) \ ((struct nft_expr *)&__elem_expr->data[__offset]) #define nft_setelem_expr_foreach(__expr, __elem_expr, __size) \ for (__expr = nft_setelem_expr_at(__elem_expr, 0), __size = 0; \ __size < (__elem_expr)->size; \ __size += (__expr)->ops->size, __expr = ((void *)(__expr)) + (__expr)->ops->size) #define NFT_SET_EXPR_MAX 2 /** * struct nft_set - nf_tables set instance * * @list: table set list node * @bindings: list of set bindings * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set * @handle: unique handle of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) * @dtype: data type (verdict or numeric type defined by userspace) * @objtype: object type (see NFT_OBJECT_* definitions) * @size: maximum set size * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal * @timeout: default timeout value in jiffies * @gc_int: garbage collection interval in msecs * @policy: set parameterization (see enum nft_set_policies) * @udlen: user data length * @udata: user data * @expr: stateful expression * @ops: set ops * @flags: set flags * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length * @data: private set data */ struct nft_set { struct list_head list; struct list_head bindings; refcount_t refs; struct nft_table *table; possible_net_t net; char *name; u64 handle; u32 ktype; u32 dtype; u32 objtype; u32 size; u8 field_len[NFT_REG32_COUNT]; u8 field_count; u32 use; atomic_t nelems; u32 ndeact; u64 timeout; u32 gc_int; u16 policy; u16 udlen; unsigned char *udata; struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags:13, dead:1, genmask:2; u8 klen; u8 dlen; u8 num_exprs; struct nft_expr *exprs[NFT_SET_EXPR_MAX]; struct list_head catchall_list; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline bool nft_set_is_anonymous(const struct nft_set *set) { return set->flags & NFT_SET_ANONYMOUS; } static inline void *nft_set_priv(const struct nft_set *set) { return (void *)set->data; } static inline bool nft_set_gc_is_pending(const struct nft_set *s) { return refcount_read(&s->refs) != 1; } static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); } struct nft_set *nft_set_lookup_global(const struct net *net, const struct nft_table *table, const struct nlattr *nla_set_name, const struct nlattr *nla_set_id, u8 genmask); struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { u32 gc_int = READ_ONCE(set->gc_int); return gc_int ? msecs_to_jiffies(gc_int) : HZ; } /** * struct nft_set_binding - nf_tables set binding * * @list: set bindings list node * @chain: chain containing the rule bound to the set * @flags: set action flags * * A set binding contains all information necessary for validation * of new elements added to a bound set. */ struct nft_set_binding { struct list_head list; const struct nft_chain *chain; u32 flags; }; enum nft_trans_phase; void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set); void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase); int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); /** * enum nft_set_extensions - set extension type IDs * * @NFT_SET_EXT_KEY: element key * @NFT_SET_EXT_KEY_END: upper bound element key, for ranges * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_USERDATA: user data associated with the element * @NFT_SET_EXT_EXPRESSIONS: expressions assiciated with the element * @NFT_SET_EXT_OBJREF: stateful object reference associated with element * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { NFT_SET_EXT_KEY, NFT_SET_EXT_KEY_END, NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_USERDATA, NFT_SET_EXT_EXPRESSIONS, NFT_SET_EXT_OBJREF, NFT_SET_EXT_NUM }; /** * struct nft_set_ext_type - set extension type * * @len: fixed part length of the extension * @align: alignment requirements of the extension */ struct nft_set_ext_type { u8 len; u8 align; }; extern const struct nft_set_ext_type nft_set_ext_types[]; /** * struct nft_set_ext_tmpl - set extension template * * @len: length of extension area * @offset: offsets of individual extension types */ struct nft_set_ext_tmpl { u16 len; u8 offset[NFT_SET_EXT_NUM]; u8 ext_len[NFT_SET_EXT_NUM]; }; /** * struct nft_set_ext - set extensions * * @genmask: generation mask * @offset: offsets of individual extension types * @data: beginning of extension data */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; }; static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { memset(tmpl, 0, sizeof(*tmpl)); tmpl->len = sizeof(struct nft_set_ext); } static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id, unsigned int len) { tmpl->len = ALIGN(tmpl->len, nft_set_ext_types[id].align); if (tmpl->len > U8_MAX) return -EINVAL; tmpl->offset[id] = tmpl->len; tmpl->ext_len[id] = nft_set_ext_types[id].len + len; tmpl->len += tmpl->ext_len[id]; return 0; } static inline int nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id) { return nft_set_ext_add_length(tmpl, id, 0); } static inline void nft_set_ext_init(struct nft_set_ext *ext, const struct nft_set_ext_tmpl *tmpl) { memcpy(ext->offset, tmpl->offset, sizeof(ext->offset)); } static inline bool __nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return !!ext->offset[id]; } static inline bool nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return ext && __nft_set_ext_exists(ext, id); } static inline void *nft_set_ext(const struct nft_set_ext *ext, u8 id) { return (void *)ext + ext->offset[id]; } static inline struct nft_data *nft_set_ext_key(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY); } static inline struct nft_data *nft_set_ext_key_end(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY_END); } static inline struct nft_data *nft_set_ext_data(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_DATA); } static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_FLAGS); } static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); } static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); } static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_USERDATA); } static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS); } static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && time_is_before_eq_jiffies64(*nft_set_ext_expiration(ext)); } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, void *elem) { return elem + set->ops->elemsize; } static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_OBJREF); } struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr); void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, const u32 *data, u64 timeout, u64 expiration, gfp_t gfp); int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr); void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, void *elem); struct nft_expr_ops; /** * struct nft_expr_type - nf_tables expression type * * @select_ops: function to select nft_expr_ops * @release_ops: release nft_expr_ops * @ops: default ops, used when no select_ops functions is present * @list: used internally * @name: Identifier * @owner: module reference * @policy: netlink attribute policy * @maxattr: highest netlink attribute number * @family: address family for AF-specific types * @flags: expression type flags */ struct nft_expr_type { const struct nft_expr_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); void (*release_ops)(const struct nft_expr_ops *ops); const struct nft_expr_ops *ops; const struct nft_expr_ops *inner_ops; struct list_head list; const char *name; struct module *owner; const struct nla_policy *policy; unsigned int maxattr; u8 family; u8 flags; }; #define NFT_EXPR_STATEFUL 0x1 #define NFT_EXPR_GC 0x2 enum nft_trans_phase { NFT_TRANS_PREPARE, NFT_TRANS_PREPARE_ERROR, NFT_TRANS_ABORT, NFT_TRANS_COMMIT, NFT_TRANS_RELEASE }; struct nft_flow_rule; struct nft_offload_ctx; /** * struct nft_expr_ops - nf_tables expression operations * * @eval: Expression evaluation function * @size: full expression size, including private data size * @init: initialization function * @activate: activate expression in the next generation * @deactivate: deactivate expression in next generation * @destroy: destruction function, called after synchronize_rcu * @dump: function to dump parameters * @type: expression type * @validate: validate expression, called during loop detection * @data: extra data to attach to this expression operation */ struct nft_expr_ops { void (*eval)(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); int (*clone)(struct nft_expr *dst, const struct nft_expr *src); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); void (*activate)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*deactivate)(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase); void (*destroy)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*destroy_clone)(const struct nft_ctx *ctx, const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); bool (*reduce)(struct nft_regs_track *track, const struct nft_expr *expr); bool (*gc)(struct net *net, const struct nft_expr *expr); int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); bool (*offload_action)(const struct nft_expr *expr); void (*offload_stats)(struct nft_expr *expr, const struct flow_stats *stats); const struct nft_expr_type *type; void *data; }; /** * struct nft_rule - nf_tables rule * * @list: used internally * @handle: rule handle * @genmask: generation mask * @dlen: length of expression data * @udata: user data is appended to the rule * @data: expression data */ struct nft_rule { struct list_head list; u64 handle:42, genmask:2, dlen:12, udata:1; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[0]; } static inline struct nft_expr *nft_expr_next(const struct nft_expr *expr) { return ((void *)expr) + expr->ops->size; } static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[rule->dlen]; } static inline bool nft_expr_more(const struct nft_rule *rule, const struct nft_expr *expr) { return expr != nft_expr_last(rule) && expr->ops; } static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) { return (void *)&rule->data[rule->dlen]; } void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule); void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, enum nft_trans_phase phase); void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule); static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_set_elem_expr *elem_expr; struct nft_expr *expr; u32 size; if (__nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) { elem_expr = nft_set_ext_expr(ext); nft_setelem_expr_foreach(expr, elem_expr, size) { expr->ops->eval(expr, regs, pkt); if (regs->verdict.code == NFT_BREAK) return; } } } /* * The last pointer isn't really necessary, but the compiler isn't able to * determine that the result of nft_expr_last() is always the same since it * can't assume that the dlen value wasn't changed within calls in the loop. */ #define nft_rule_for_each_expr(expr, last, rule) \ for ((expr) = nft_expr_first(rule), (last) = nft_expr_last(rule); \ (expr) != (last); \ (expr) = nft_expr_next(expr)) #define NFT_CHAIN_POLICY_UNSET U8_MAX struct nft_rule_dp { u64 is_last:1, dlen:12, handle:42; /* for tracing */ unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; struct nft_rule_dp_last { struct nft_rule_dp end; /* end of nft_rule_blob marker */ struct rcu_head h; /* call_rcu head */ struct nft_rule_blob *blob; /* ptr to free via call_rcu */ const struct nft_chain *chain; /* for nftables tracing */ }; static inline const struct nft_rule_dp *nft_rule_next(const struct nft_rule_dp *rule) { return (void *)rule + sizeof(*rule) + rule->dlen; } struct nft_rule_blob { unsigned long size; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_rule_dp)))); }; /** * struct nft_chain - nf_tables chain * * @rules: list of rules in the chain * @list: used internally * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain * @flags: bitmask of enum nft_chain_flags * @name: name of the chain */ struct nft_chain { struct nft_rule_blob __rcu *blob_gen_0; struct nft_rule_blob __rcu *blob_gen_1; struct list_head rules; struct list_head list; struct rhlist_head rhlhead; struct nft_table *table; u64 handle; u32 use; u8 flags:5, bound:1, genmask:2; char *name; u16 udlen; u8 *udata; /* Only used during control plane commit phase: */ struct nft_rule_blob *blob_next; }; int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_set_elem *elem); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, NFT_CHAIN_T_NAT, NFT_CHAIN_T_MAX }; /** * struct nft_chain_type - nf_tables chain type info * * @name: name of the type * @type: numeric identifier * @family: address family * @owner: module owner * @hook_mask: mask of valid hooks * @hooks: array of hook functions * @ops_register: base chain register function * @ops_unregister: base chain unregister function */ struct nft_chain_type { const char *name; enum nft_chain_types type; int family; struct module *owner; unsigned int hook_mask; nf_hookfn *hooks[NFT_MAX_HOOKS]; int (*ops_register)(struct net *net, const struct nf_hook_ops *ops); void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops); }; int nft_chain_validate_dependency(const struct nft_chain *chain, enum nft_chain_types type); int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); static inline bool nft_chain_binding(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BINDING; } static inline bool nft_chain_is_bound(struct nft_chain *chain) { return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; } int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); void nf_tables_chain_destroy(struct nft_ctx *ctx); struct nft_stats { u64 bytes; u64 pkts; struct u64_stats_sync syncp; }; struct nft_hook { struct list_head list; struct nf_hook_ops ops; struct rcu_head rcu; }; /** * struct nft_base_chain - nf_tables base chain * * @ops: netfilter hook ops * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy * @stats: per-cpu chain stats * @chain: the chain * @flow_block: flow block (for hardware offload) */ struct nft_base_chain { struct nf_hook_ops ops; struct list_head hook_list; const struct nft_chain_type *type; u8 policy; u8 flags; struct nft_stats __percpu *stats; struct nft_chain chain; struct flow_block flow_block; }; static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain) { return container_of(chain, struct nft_base_chain, chain); } static inline bool nft_is_base_chain(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BASE; } int __nft_release_basechain(struct nft_ctx *ctx); unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); static inline bool nft_use_inc(u32 *use) { if (*use == UINT_MAX) return false; (*use)++; return true; } static inline void nft_use_dec(u32 *use) { WARN_ON_ONCE((*use)-- == 0); } /* For error and abort path: restore use counter to previous state. */ static inline void nft_use_inc_restore(u32 *use) { WARN_ON_ONCE(!nft_use_inc(use)); } #define nft_use_dec_restore nft_use_dec /** * struct nft_table - nf_tables table * * @list: used internally * @chains_ht: chains in the table * @chains: same, for stable walks * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table * @flags: table flag (see enum nft_table_flags) * @genmask: generation mask * @afinfo: address family info * @name: name of the table * @validate_state: internal, set when transaction adds jumps */ struct nft_table { struct list_head list; struct rhltable chains_ht; struct list_head chains; struct list_head sets; struct list_head objects; struct list_head flowtables; u64 hgenerator; u64 handle; u32 use; u16 family:6, flags:8, genmask:2; u32 nlpid; char *name; u16 udlen; u8 *udata; u8 validate_state; }; static inline bool nft_table_has_owner(const struct nft_table *table) { return table->flags & NFT_TABLE_F_OWNER; } static inline bool nft_base_chain_netdev(int family, u32 hooknum) { return family == NFPROTO_NETDEV || (family == NFPROTO_INET && hooknum == NF_INET_INGRESS); } void nft_register_chain_type(const struct nft_chain_type *); void nft_unregister_chain_type(const struct nft_chain_type *); int nft_register_expr(struct nft_expr_type *); void nft_unregister_expr(struct nft_expr_type *); int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v); /** * struct nft_object_hash_key - key to lookup nft_object * * @name: name of the stateful object to look up * @table: table the object belongs to */ struct nft_object_hash_key { const char *name; const struct nft_table *table; }; /** * struct nft_object - nf_tables stateful object * * @list: table stateful object list node * @key: keys that identify this object * @rhlhead: nft_objname_ht node * @genmask: generation mask * @use: number of references to this stateful object * @handle: unique object handle * @ops: object operations * @data: object data, layout depends on type */ struct nft_object { struct list_head list; struct rhlist_head rhlhead; struct nft_object_hash_key key; u32 genmask:2; u32 use; u64 handle; u16 udlen; u8 *udata; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_obj_data(const struct nft_object *obj) { return (void *)obj->data; } #define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) struct nft_object *nft_obj_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask); void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp); /** * struct nft_object_type - stateful object type * * @select_ops: function to select nft_object_ops * @ops: default ops, used when no select_ops functions is present * @list: list node in list of object types * @type: stateful object numeric type * @owner: module owner * @maxattr: maximum netlink attribute * @policy: netlink attribute policy */ struct nft_object_type { const struct nft_object_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); const struct nft_object_ops *ops; struct list_head list; u32 type; unsigned int maxattr; struct module *owner; const struct nla_policy *policy; }; /** * struct nft_object_ops - stateful object operations * * @eval: stateful object evaluation function * @size: stateful object size * @init: initialize object from netlink attributes * @destroy: release existing stateful object * @dump: netlink dump stateful object * @update: update stateful object */ struct nft_object_ops { void (*eval)(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nlattr *const tb[], struct nft_object *obj); void (*destroy)(const struct nft_ctx *ctx, struct nft_object *obj); int (*dump)(struct sk_buff *skb, struct nft_object *obj, bool reset); void (*update)(struct nft_object *obj, struct nft_object *newobj); const struct nft_object_type *type; }; int nft_register_obj(struct nft_object_type *obj_type); void nft_unregister_obj(struct nft_object_type *obj_type); #define NFT_NETDEVICE_MAX 256 /** * struct nft_flowtable - nf_tables flow table * * @list: flow table list node in table list * @table: the table the flow table is contained in * @name: name of this flow table * @hooknum: hook number * @ops_len: number of hooks in array * @genmask: generation mask * @use: number of references to this flow table * @handle: unique object handle * @dev_name: array of device names * @data: rhashtable and garbage collector * @ops: array of hooks */ struct nft_flowtable { struct list_head list; struct nft_table *table; char *name; int hooknum; int ops_len; u32 genmask:2; u32 use; u64 handle; /* runtime data below here */ struct list_head hook_list ____cacheline_aligned; struct nf_flowtable data; }; struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask); void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, struct nft_flowtable *flowtable, enum nft_trans_phase phase); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); /** * struct nft_traceinfo - nft tracing information and state * * @trace: other struct members are initialised * @nf_trace: copy of skb->nf_trace before rule evaluation * @type: event type (enum nft_trace_types) * @skbid: hash of skb to be used as trace id * @packet_dumped: packet headers sent in a previous traceinfo message * @basechain: base chain currently processed */ struct nft_traceinfo { bool trace; bool nf_trace; bool packet_dumped; enum nft_trace_types type:8; u32 skbid; const struct nft_base_chain *basechain; }; void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, const struct nft_chain *basechain); void nft_trace_notify(const struct nft_pktinfo *pkt, const struct nft_verdict *verdict, const struct nft_rule_dp *rule, struct nft_traceinfo *info); #define MODULE_ALIAS_NFT_CHAIN(family, name) \ MODULE_ALIAS("nft-chain-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_AF_EXPR(family, name) \ MODULE_ALIAS("nft-expr-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_EXPR(name) \ MODULE_ALIAS("nft-expr-" name) #define MODULE_ALIAS_NFT_OBJ(type) \ MODULE_ALIAS("nft-obj-" __stringify(type)) #if IS_ENABLED(CONFIG_NF_TABLES) /* * The gencursor defines two generations, the currently active and the * next one. Objects contain a bitmask of 2 bits specifying the generations * they're active in. A set bit means they're inactive in the generation * represented by that bit. * * New objects start out as inactive in the current and active in the * next generation. When committing the ruleset the bitmask is cleared, * meaning they're active in all generations. When removing an object, * it is set inactive in the next generation. After committing the ruleset, * the objects are removed. */ static inline unsigned int nft_gencursor_next(const struct net *net) { return net->nft.gencursor + 1 == 1 ? 1 : 0; } static inline u8 nft_genmask_next(const struct net *net) { return 1 << nft_gencursor_next(net); } static inline u8 nft_genmask_cur(const struct net *net) { /* Use READ_ONCE() to prevent refetching the value for atomicity */ return 1 << READ_ONCE(net->nft.gencursor); } #define NFT_GENMASK_ANY ((1 << 0) | (1 << 1)) /* * Generic transaction helpers */ /* Check if this object is currently active. */ #define nft_is_active(__net, __obj) \ (((__obj)->genmask & nft_genmask_cur(__net)) == 0) /* Check if this object is active in the next generation. */ #define nft_is_active_next(__net, __obj) \ (((__obj)->genmask & nft_genmask_next(__net)) == 0) /* This object becomes active in the next generation. */ #define nft_activate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_cur(__net) /* This object becomes inactive in the next generation. */ #define nft_deactivate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_next(__net) /* After committing the ruleset, clear the stale generation bit. */ #define nft_clear(__net, __obj) \ (__obj)->genmask &= ~nft_genmask_next(__net) #define nft_active_genmask(__obj, __genmask) \ !((__obj)->genmask & __genmask) /* * Set element transaction helpers */ static inline bool nft_set_elem_active(const struct nft_set_ext *ext, u8 genmask) { return !(ext->genmask & genmask); } static inline void nft_set_elem_change_active(const struct net *net, const struct nft_set *set, struct nft_set_ext *ext) { ext->genmask ^= nft_genmask_next(net); } #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ #define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif static inline void nft_set_elem_dead(struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); set_bit(NFT_SET_ELEM_DEAD_BIT, word); } static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); return test_bit(NFT_SET_ELEM_DEAD_BIT, word); } /** * struct nft_trans - nf_tables object update in transaction * * @list: used internally * @binding_list: list of objects with possible bindings * @msg_type: message type * @put_net: ctx->net needs to be put * @ctx: transaction context * @data: internal information related to the transaction */ struct nft_trans { struct list_head list; struct list_head binding_list; int msg_type; bool put_net; struct nft_ctx ctx; char data[]; }; struct nft_trans_rule { struct nft_rule *rule; struct nft_flow_rule *flow; u32 rule_id; bool bound; }; #define nft_trans_rule(trans) \ (((struct nft_trans_rule *)trans->data)->rule) #define nft_trans_flow_rule(trans) \ (((struct nft_trans_rule *)trans->data)->flow) #define nft_trans_rule_id(trans) \ (((struct nft_trans_rule *)trans->data)->rule_id) #define nft_trans_rule_bound(trans) \ (((struct nft_trans_rule *)trans->data)->bound) struct nft_trans_set { struct nft_set *set; u32 set_id; u32 gc_int; u64 timeout; bool update; bool bound; u32 size; }; #define nft_trans_set(trans) \ (((struct nft_trans_set *)trans->data)->set) #define nft_trans_set_id(trans) \ (((struct nft_trans_set *)trans->data)->set_id) #define nft_trans_set_bound(trans) \ (((struct nft_trans_set *)trans->data)->bound) #define nft_trans_set_update(trans) \ (((struct nft_trans_set *)trans->data)->update) #define nft_trans_set_timeout(trans) \ (((struct nft_trans_set *)trans->data)->timeout) #define nft_trans_set_gc_int(trans) \ (((struct nft_trans_set *)trans->data)->gc_int) #define nft_trans_set_size(trans) \ (((struct nft_trans_set *)trans->data)->size) struct nft_trans_chain { struct nft_chain *chain; bool update; char *name; struct nft_stats __percpu *stats; u8 policy; bool bound; u32 chain_id; struct nft_base_chain *basechain; struct list_head hook_list; }; #define nft_trans_chain(trans) \ (((struct nft_trans_chain *)trans->data)->chain) #define nft_trans_chain_update(trans) \ (((struct nft_trans_chain *)trans->data)->update) #define nft_trans_chain_name(trans) \ (((struct nft_trans_chain *)trans->data)->name) #define nft_trans_chain_stats(trans) \ (((struct nft_trans_chain *)trans->data)->stats) #define nft_trans_chain_policy(trans) \ (((struct nft_trans_chain *)trans->data)->policy) #define nft_trans_chain_bound(trans) \ (((struct nft_trans_chain *)trans->data)->bound) #define nft_trans_chain_id(trans) \ (((struct nft_trans_chain *)trans->data)->chain_id) #define nft_trans_basechain(trans) \ (((struct nft_trans_chain *)trans->data)->basechain) #define nft_trans_chain_hooks(trans) \ (((struct nft_trans_chain *)trans->data)->hook_list) struct nft_trans_table { bool update; }; #define nft_trans_table_update(trans) \ (((struct nft_trans_table *)trans->data)->update) struct nft_trans_elem { struct nft_set *set; struct nft_set_elem elem; bool bound; }; #define nft_trans_elem_set(trans) \ (((struct nft_trans_elem *)trans->data)->set) #define nft_trans_elem(trans) \ (((struct nft_trans_elem *)trans->data)->elem) #define nft_trans_elem_set_bound(trans) \ (((struct nft_trans_elem *)trans->data)->bound) struct nft_trans_obj { struct nft_object *obj; struct nft_object *newobj; bool update; }; #define nft_trans_obj(trans) \ (((struct nft_trans_obj *)trans->data)->obj) #define nft_trans_obj_newobj(trans) \ (((struct nft_trans_obj *)trans->data)->newobj) #define nft_trans_obj_update(trans) \ (((struct nft_trans_obj *)trans->data)->update) struct nft_trans_flowtable { struct nft_flowtable *flowtable; bool update; struct list_head hook_list; u32 flags; }; #define nft_trans_flowtable(trans) \ (((struct nft_trans_flowtable *)trans->data)->flowtable) #define nft_trans_flowtable_update(trans) \ (((struct nft_trans_flowtable *)trans->data)->update) #define nft_trans_flowtable_hooks(trans) \ (((struct nft_trans_flowtable *)trans->data)->hook_list) #define nft_trans_flowtable_flags(trans) \ (((struct nft_trans_flowtable *)trans->data)->flags) #define NFT_TRANS_GC_BATCHCOUNT 256 struct nft_trans_gc { struct list_head list; struct net *net; struct nft_set *set; u32 seq; u16 count; void *priv[NFT_TRANS_GC_BATCHCOUNT]; struct rcu_head rcu; }; struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_destroy(struct nft_trans_gc *trans); struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, unsigned int gc_seq); struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc); void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, struct nft_set_elem *elem); int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); void __init nft_chain_route_init(void); void nft_chain_route_fini(void); void nf_tables_trans_destroy_flush_work(void); int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result); __be64 nf_jiffies64_to_msecs(u64 input); #ifdef CONFIG_MODULES __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...); #else static inline int nft_request_module(struct net *net, const char *fmt, ...) { return -ENOENT; } #endif struct nftables_pernet { struct list_head tables; struct list_head commit_list; struct list_head binding_list; struct list_head module_list; struct list_head notify_list; struct mutex commit_mutex; u64 table_handle; unsigned int base_seq; unsigned int gc_seq; u8 validate_state; }; extern unsigned int nf_tables_net_id; static inline struct nftables_pernet *nft_pernet(const struct net *net) { return net_generic(net, nf_tables_net_id); } #define __NFT_REDUCE_READONLY 1UL #define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) { return expr->ops->reduce == NFT_REDUCE_READONLY; } void nft_reg_track_update(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg, u8 len); void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg); static inline bool nft_reg_track_cmp(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg) { return track->regs[dreg].selector && track->regs[dreg].selector->ops == expr->ops && track->regs[dreg].num_reg == 0; } #endif /* _NET_NF_TABLES_H */ |
358 417 78 1062 90 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | /** * css_get - obtain a reference on the specified css * @css: target css * * The caller must already have a reference. */ CGROUP_REF_FN_ATTRS void css_get(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get(&css->refcnt); } CGROUP_REF_EXPORT(css_get) /** * css_get_many - obtain references on the specified css * @css: target css * @n: number of references to get * * The caller must already have a reference. */ CGROUP_REF_FN_ATTRS void css_get_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get_many(&css->refcnt, n); } CGROUP_REF_EXPORT(css_get_many) /** * css_tryget - try to obtain a reference on the specified css * @css: target css * * Obtain a reference on @css unless it already has reached zero and is * being released. This function doesn't care whether @css is on or * offline. The caller naturally needs to ensure that @css is accessible * but doesn't have to be holding a reference on it - IOW, RCU protected * access is good enough for this function. Returns %true if a reference * count was successfully obtained; %false otherwise. */ CGROUP_REF_FN_ATTRS bool css_tryget(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget(&css->refcnt); return true; } CGROUP_REF_EXPORT(css_tryget) /** * css_tryget_online - try to obtain a reference on the specified css if online * @css: target css * * Obtain a reference on @css if it's online. The caller naturally needs * to ensure that @css is accessible but doesn't have to be holding a * reference on it - IOW, RCU protected access is good enough for this * function. Returns %true if a reference count was successfully obtained; * %false otherwise. */ CGROUP_REF_FN_ATTRS bool css_tryget_online(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget_live(&css->refcnt); return true; } CGROUP_REF_EXPORT(css_tryget_online) /** * css_put - put a css reference * @css: target css * * Put a reference obtained via css_get() and css_tryget_online(). */ CGROUP_REF_FN_ATTRS void css_put(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put(&css->refcnt); } CGROUP_REF_EXPORT(css_put) /** * css_put_many - put css references * @css: target css * @n: number of references to put * * Put references obtained via css_get() and css_tryget_online(). */ CGROUP_REF_FN_ATTRS void css_put_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put_many(&css->refcnt, n); } CGROUP_REF_EXPORT(css_put_many) |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 | // SPDX-License-Identifier: GPL-2.0-only /* * debugfs code for HSR & PRP * Copyright (C) 2019 Texas Instruments Incorporated * * Author(s): * Murali Karicheri <m-karicheri2@ti.com> */ #include <linux/module.h> #include <linux/errno.h> #include <linux/debugfs.h> #include "hsr_main.h" #include "hsr_framereg.h" static struct dentry *hsr_debugfs_root_dir; /* hsr_node_table_show - Formats and prints node_table entries */ static int hsr_node_table_show(struct seq_file *sfp, void *data) { struct hsr_priv *priv = (struct hsr_priv *)sfp->private; struct hsr_node *node; seq_printf(sfp, "Node Table entries for (%s) device\n", (priv->prot_version == PRP_V1 ? "PRP" : "HSR")); seq_puts(sfp, "MAC-Address-A, MAC-Address-B, time_in[A], "); seq_puts(sfp, "time_in[B], Address-B port, "); if (priv->prot_version == PRP_V1) seq_puts(sfp, "SAN-A, SAN-B, DAN-P\n"); else seq_puts(sfp, "DAN-H\n"); rcu_read_lock(); list_for_each_entry_rcu(node, &priv->node_db, mac_list) { /* skip self node */ if (hsr_addr_is_self(priv, node->macaddress_A)) continue; seq_printf(sfp, "%pM ", &node->macaddress_A[0]); seq_printf(sfp, "%pM ", &node->macaddress_B[0]); seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_A]); seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_B]); seq_printf(sfp, "%14x, ", node->addr_B_port); if (priv->prot_version == PRP_V1) seq_printf(sfp, "%5x, %5x, %5x\n", node->san_a, node->san_b, (node->san_a == 0 && node->san_b == 0)); else seq_printf(sfp, "%5x\n", 1); } rcu_read_unlock(); return 0; } DEFINE_SHOW_ATTRIBUTE(hsr_node_table); void hsr_debugfs_rename(struct net_device *dev) { struct hsr_priv *priv = netdev_priv(dev); struct dentry *d; d = debugfs_rename(hsr_debugfs_root_dir, priv->node_tbl_root, hsr_debugfs_root_dir, dev->name); if (IS_ERR(d)) netdev_warn(dev, "failed to rename\n"); else priv->node_tbl_root = d; } /* hsr_debugfs_init - create hsr node_table file for dumping * the node table * * Description: * When debugfs is configured this routine sets up the node_table file per * hsr device for dumping the node_table entries */ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) { struct dentry *de = NULL; de = debugfs_create_dir(hsr_dev->name, hsr_debugfs_root_dir); if (IS_ERR(de)) { pr_err("Cannot create hsr debugfs directory\n"); return; } priv->node_tbl_root = de; de = debugfs_create_file("node_table", S_IFREG | 0444, priv->node_tbl_root, priv, &hsr_node_table_fops); if (IS_ERR(de)) { pr_err("Cannot create hsr node_table file\n"); debugfs_remove(priv->node_tbl_root); priv->node_tbl_root = NULL; return; } } /* hsr_debugfs_term - Tear down debugfs intrastructure * * Description: * When Debugfs is configured this routine removes debugfs file system * elements that are specific to hsr */ void hsr_debugfs_term(struct hsr_priv *priv) { debugfs_remove_recursive(priv->node_tbl_root); priv->node_tbl_root = NULL; } void hsr_debugfs_create_root(void) { hsr_debugfs_root_dir = debugfs_create_dir("hsr", NULL); if (IS_ERR(hsr_debugfs_root_dir)) { pr_err("Cannot create hsr debugfs root directory\n"); hsr_debugfs_root_dir = NULL; } } void hsr_debugfs_remove_root(void) { /* debugfs_remove() internally checks NULL and ERROR */ debugfs_remove(hsr_debugfs_root_dir); } |
53 53 26 68 48 31 2 3 26 49 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _IPV6_FRAG_H #define _IPV6_FRAG_H #include <linux/icmpv6.h> #include <linux/kernel.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <net/inet_frag.h> enum ip6_defrag_users { IP6_DEFRAG_LOCAL_DELIVER, IP6_DEFRAG_CONNTRACK_IN, __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX, IP6_DEFRAG_CONNTRACK_OUT, __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX, IP6_DEFRAG_CONNTRACK_BRIDGE_IN, __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, }; /* * Equivalent of ipv4 struct ip */ struct frag_queue { struct inet_frag_queue q; int iif; __u16 nhoffset; u8 ecn; }; #if IS_ENABLED(CONFIG_IPV6) static inline void ip6frag_init(struct inet_frag_queue *q, const void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); const struct frag_v6_compare_key *key = a; q->key.v6 = *key; fq->ecn = 0; } static inline u32 ip6frag_key_hashfn(const void *data, u32 len, u32 seed) { return jhash2(data, sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); } static inline u32 ip6frag_obj_hashfn(const void *data, u32 len, u32 seed) { const struct inet_frag_queue *fq = data; return jhash2((const u32 *)&fq->key.v6, sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); } static inline int ip6frag_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) { const struct frag_v6_compare_key *key = arg->key; const struct inet_frag_queue *fq = ptr; return !!memcmp(&fq->key, key, sizeof(*key)); } static inline void ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) { struct net_device *dev = NULL; struct sk_buff *head; rcu_read_lock(); /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */ if (READ_ONCE(fq->q.fqdir->dead)) goto out_rcu_unlock; spin_lock(&fq->q.lock); if (fq->q.flags & INET_FRAG_COMPLETE) goto out; fq->q.flags |= INET_FRAG_DROP; inet_frag_kill(&fq->q); dev = dev_get_by_index_rcu(net, fq->iif); if (!dev) goto out; __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); /* Don't send error if the first segment did not arrive. */ if (!(fq->q.flags & INET_FRAG_FIRST_IN)) goto out; /* sk_buff::dev and sk_buff::rbnode are unionized. So we * pull the head out of the tree in order to be able to * deal with head->dev. */ head = inet_frag_pull_head(&fq->q); if (!head) goto out; head->dev = dev; spin_unlock(&fq->q.lock); icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); goto out_rcu_unlock; out: spin_unlock(&fq->q.lock); out_rcu_unlock: rcu_read_unlock(); inet_frag_put(&fq->q); } /* Check if the upper layer header is truncated in the first fragment. */ static inline bool ipv6frag_thdr_truncated(struct sk_buff *skb, int start, u8 *nexthdrp) { u8 nexthdr = *nexthdrp; __be16 frag_off; int offset; offset = ipv6_skip_exthdr(skb, start, &nexthdr, &frag_off); if (offset < 0 || (frag_off & htons(IP6_OFFSET))) return false; switch (nexthdr) { case NEXTHDR_TCP: offset += sizeof(struct tcphdr); break; case NEXTHDR_UDP: offset += sizeof(struct udphdr); break; case NEXTHDR_ICMP: offset += sizeof(struct icmp6hdr); break; default: offset += 1; } if (offset > skb->len) return true; return false; } #endif #endif |
1648 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef MPLS_INTERNAL_H #define MPLS_INTERNAL_H #include <net/mpls.h> /* put a reasonable limit on the number of labels * we will accept from userspace */ #define MAX_NEW_LABELS 30 struct mpls_entry_decoded { u32 label; u8 ttl; u8 tc; u8 bos; }; struct mpls_pcpu_stats { struct mpls_link_stats stats; struct u64_stats_sync syncp; }; struct mpls_dev { int input_enabled; struct net_device *dev; struct mpls_pcpu_stats __percpu *stats; struct ctl_table_header *sysctl; struct rcu_head rcu; }; #if BITS_PER_LONG == 32 #define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \ do { \ __typeof__(*(mdev)->stats) *ptr = \ raw_cpu_ptr((mdev)->stats); \ local_bh_disable(); \ u64_stats_update_begin(&ptr->syncp); \ ptr->stats.pkts_field++; \ ptr->stats.bytes_field += (len); \ u64_stats_update_end(&ptr->syncp); \ local_bh_enable(); \ } while (0) #define MPLS_INC_STATS(mdev, field) \ do { \ __typeof__(*(mdev)->stats) *ptr = \ raw_cpu_ptr((mdev)->stats); \ local_bh_disable(); \ u64_stats_update_begin(&ptr->syncp); \ ptr->stats.field++; \ u64_stats_update_end(&ptr->syncp); \ local_bh_enable(); \ } while (0) #else #define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \ do { \ this_cpu_inc((mdev)->stats->stats.pkts_field); \ this_cpu_add((mdev)->stats->stats.bytes_field, (len)); \ } while (0) #define MPLS_INC_STATS(mdev, field) \ this_cpu_inc((mdev)->stats->stats.field) #endif struct sk_buff; #define LABEL_NOT_SPECIFIED (1 << 20) /* This maximum ha length copied from the definition of struct neighbour */ #define VIA_ALEN_ALIGN sizeof(unsigned long) #define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, VIA_ALEN_ALIGN)) enum mpls_payload_type { MPT_UNSPEC, /* IPv4 or IPv6 */ MPT_IPV4 = 4, MPT_IPV6 = 6, /* Other types not implemented: * - Pseudo-wire with or without control word (RFC4385) * - GAL (RFC5586) */ }; struct mpls_nh { /* next hop label forwarding entry */ struct net_device *nh_dev; /* nh_flags is accessed under RCU in the packet path; it is * modified handling netdev events with rtnl lock held */ unsigned int nh_flags; u8 nh_labels; u8 nh_via_alen; u8 nh_via_table; u8 nh_reserved1; u32 nh_label[]; }; /* offset of via from beginning of mpls_nh */ #define MPLS_NH_VIA_OFF(num_labels) \ ALIGN(sizeof(struct mpls_nh) + (num_labels) * sizeof(u32), \ VIA_ALEN_ALIGN) /* all nexthops within a route have the same size based on the * max number of labels and max via length across all nexthops */ #define MPLS_NH_SIZE(num_labels, max_via_alen) \ (MPLS_NH_VIA_OFF((num_labels)) + \ ALIGN((max_via_alen), VIA_ALEN_ALIGN)) enum mpls_ttl_propagation { MPLS_TTL_PROP_DEFAULT, MPLS_TTL_PROP_ENABLED, MPLS_TTL_PROP_DISABLED, }; /* The route, nexthops and vias are stored together in the same memory * block: * * +----------------------+ * | mpls_route | * +----------------------+ * | mpls_nh 0 | * +----------------------+ * | alignment padding | 4 bytes for odd number of labels * +----------------------+ * | via[rt_max_alen] 0 | * +----------------------+ * | alignment padding | via's aligned on sizeof(unsigned long) * +----------------------+ * | ... | * +----------------------+ * | mpls_nh n-1 | * +----------------------+ * | via[rt_max_alen] n-1 | * +----------------------+ */ struct mpls_route { /* next hop label forwarding entry */ struct rcu_head rt_rcu; u8 rt_protocol; u8 rt_payload_type; u8 rt_max_alen; u8 rt_ttl_propagate; u8 rt_nhn; /* rt_nhn_alive is accessed under RCU in the packet path; it * is modified handling netdev events with rtnl lock held */ u8 rt_nhn_alive; u8 rt_nh_size; u8 rt_via_offset; u8 rt_reserved1; struct mpls_nh rt_nh[]; }; #define for_nexthops(rt) { \ int nhsel; const struct mpls_nh *nh; \ for (nhsel = 0, nh = (rt)->rt_nh; \ nhsel < (rt)->rt_nhn; \ nh = (void *)nh + (rt)->rt_nh_size, nhsel++) #define change_nexthops(rt) { \ int nhsel; struct mpls_nh *nh; \ for (nhsel = 0, nh = (rt)->rt_nh; \ nhsel < (rt)->rt_nhn; \ nh = (void *)nh + (rt)->rt_nh_size, nhsel++) #define endfor_nexthops(rt) } static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *hdr) { struct mpls_entry_decoded result; unsigned entry = be32_to_cpu(hdr->label_stack_entry); result.label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; result.ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; result.tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; result.bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; return result; } static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) { return rcu_dereference_rtnl(dev->mpls_ptr); } int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels, u32 label[], struct netlink_ext_ack *extack); bool mpls_output_possible(const struct net_device *dev); unsigned int mpls_dev_mtu(const struct net_device *dev); bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); void mpls_stats_inc_outucastpkts(struct net_device *dev, const struct sk_buff *skb); #endif /* MPLS_INTERNAL_H */ |
715 715 715 715 715 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | // SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/cpumask.h> #include <linux/export.h> #include <linux/memblock.h> #include <linux/numa.h> /** * cpumask_next_wrap - helper to implement for_each_cpu_wrap * @n: the cpu prior to the place to search * @mask: the cpumask pointer * @start: the start point of the iteration * @wrap: assume @n crossing @start terminates the iteration * * Returns >= nr_cpu_ids on completion * * Note: the @wrap argument is required for the start condition when * we cannot assume @start is set in @mask. */ unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) { unsigned int next; again: next = cpumask_next(n, mask); if (wrap && n < start && next >= start) { return nr_cpumask_bits; } else if (next >= nr_cpumask_bits) { wrap = true; n = -1; goto again; } return next; } EXPORT_SYMBOL(cpumask_next_wrap); /* These are not inline because of header tangles. */ #ifdef CONFIG_CPUMASK_OFFSTACK /** * alloc_cpumask_var_node - allocate a struct cpumask on a given node * @mask: pointer to cpumask_var_t where the cpumask is returned * @flags: GFP_ flags * @node: memory node from which to allocate or %NUMA_NO_NODE * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop returning a constant 1 (in <linux/cpumask.h>) * Returns TRUE if memory allocation succeeded, FALSE otherwise. * * In addition, mask will be NULL if this fails. Note that gcc is * usually smart enough to know that mask can never be NULL if * CONFIG_CPUMASK_OFFSTACK=n, so does code elimination in that case * too. */ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { *mask = kmalloc_node(cpumask_size(), flags, node); #ifdef CONFIG_DEBUG_PER_CPU_MAPS if (!*mask) { printk(KERN_ERR "=> alloc_cpumask_var: failed!\n"); dump_stack(); } #endif return *mask != NULL; } EXPORT_SYMBOL(alloc_cpumask_var_node); /** * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena. * @mask: pointer to cpumask_var_t where the cpumask is returned * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop (in <linux/cpumask.h>). * Either returns an allocated (zero-filled) cpumask, or causes the * system to panic. */ void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) { *mask = memblock_alloc(cpumask_size(), SMP_CACHE_BYTES); if (!*mask) panic("%s: Failed to allocate %u bytes\n", __func__, cpumask_size()); } /** * free_cpumask_var - frees memory allocated for a struct cpumask. * @mask: cpumask to free * * This is safe on a NULL mask. */ void free_cpumask_var(cpumask_var_t mask) { kfree(mask); } EXPORT_SYMBOL(free_cpumask_var); /** * free_bootmem_cpumask_var - frees result of alloc_bootmem_cpumask_var * @mask: cpumask to free */ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { memblock_free(mask, cpumask_size()); } #endif /** * cpumask_local_spread - select the i'th cpu based on NUMA distances * @i: index number * @node: local numa_node * * Returns online CPU according to a numa aware policy; local cpus are returned * first, followed by non-local ones, then it wraps around. * * For those who wants to enumerate all CPUs based on their NUMA distances, * i.e. call this function in a loop, like: * * for (i = 0; i < num_online_cpus(); i++) { * cpu = cpumask_local_spread(i, node); * do_something(cpu); * } * * There's a better alternative based on for_each()-like iterators: * * for_each_numa_hop_mask(mask, node) { * for_each_cpu_andnot(cpu, mask, prev) * do_something(cpu); * prev = mask; * } * * It's simpler and more verbose than above. Complexity of iterator-based * enumeration is O(sched_domains_numa_levels * nr_cpu_ids), while * cpumask_local_spread() when called for each cpu is * O(sched_domains_numa_levels * nr_cpu_ids * log(nr_cpu_ids)). */ unsigned int cpumask_local_spread(unsigned int i, int node) { unsigned int cpu; /* Wrap: we always want a cpu. */ i %= num_online_cpus(); cpu = (node == NUMA_NO_NODE) ? cpumask_nth(i, cpu_online_mask) : sched_numa_find_nth_cpu(cpu_online_mask, i, node); WARN_ON(cpu >= nr_cpu_ids); return cpu; } EXPORT_SYMBOL(cpumask_local_spread); static DEFINE_PER_CPU(int, distribute_cpu_mask_prev); /** * cpumask_any_and_distribute - Return an arbitrary cpu within src1p & src2p. * @src1p: first &cpumask for intersection * @src2p: second &cpumask for intersection * * Iterated calls using the same srcp1 and srcp2 will be distributed within * their intersection. * * Returns >= nr_cpu_ids if the intersection is empty. */ unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { unsigned int next, prev; /* NOTE: our first selection will skip 0. */ prev = __this_cpu_read(distribute_cpu_mask_prev); next = find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits, prev + 1); if (next < nr_cpu_ids) __this_cpu_write(distribute_cpu_mask_prev, next); return next; } EXPORT_SYMBOL(cpumask_any_and_distribute); unsigned int cpumask_any_distribute(const struct cpumask *srcp) { unsigned int next, prev; /* NOTE: our first selection will skip 0. */ prev = __this_cpu_read(distribute_cpu_mask_prev); next = find_next_bit_wrap(cpumask_bits(srcp), nr_cpumask_bits, prev + 1); if (next < nr_cpu_ids) __this_cpu_write(distribute_cpu_mask_prev, next); return next; } EXPORT_SYMBOL(cpumask_any_distribute); |
2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 | /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/mii.h: definitions for MII-compatible transceivers * Originally drivers/net/sunhme.h. * * Copyright (C) 1996, 1999, 2001 David S. Miller (davem@redhat.com) */ #ifndef __LINUX_MII_H__ #define __LINUX_MII_H__ #include <linux/if.h> #include <linux/linkmode.h> #include <uapi/linux/mii.h> struct ethtool_cmd; struct mii_if_info { int phy_id; int advertising; int phy_id_mask; int reg_num_mask; unsigned int full_duplex : 1; /* is full duplex? */ unsigned int force_media : 1; /* is autoneg. disabled? */ unsigned int supports_gmii : 1; /* are GMII registers supported? */ struct net_device *dev; int (*mdio_read) (struct net_device *dev, int phy_id, int location); void (*mdio_write) (struct net_device *dev, int phy_id, int location, int val); }; extern int mii_link_ok (struct mii_if_info *mii); extern int mii_nway_restart (struct mii_if_info *mii); extern void mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd); extern void mii_ethtool_get_link_ksettings( struct mii_if_info *mii, struct ethtool_link_ksettings *cmd); extern int mii_ethtool_sset(struct mii_if_info *mii, struct ethtool_cmd *ecmd); extern int mii_ethtool_set_link_ksettings( struct mii_if_info *mii, const struct ethtool_link_ksettings *cmd); extern int mii_check_gmii_support(struct mii_if_info *mii); extern void mii_check_link (struct mii_if_info *mii); extern unsigned int mii_check_media (struct mii_if_info *mii, unsigned int ok_to_print, unsigned int init_media); extern int generic_mii_ioctl(struct mii_if_info *mii_if, struct mii_ioctl_data *mii_data, int cmd, unsigned int *duplex_changed); static inline struct mii_ioctl_data *if_mii(struct ifreq *rq) { return (struct mii_ioctl_data *) &rq->ifr_ifru; } /** * mii_nway_result * @negotiated: value of MII ANAR and'd with ANLPAR * * Given a set of MII abilities, check each bit and returns the * currently supported media, in the priority order defined by * IEEE 802.3u. We use LPA_xxx constants but note this is not the * value of LPA solely, as described above. * * The one exception to IEEE 802.3u is that 100baseT4 is placed * between 100T-full and 100T-half. If your phy does not support * 100T4 this is fine. If your phy places 100T4 elsewhere in the * priority order, you will need to roll your own function. */ static inline unsigned int mii_nway_result (unsigned int negotiated) { unsigned int ret; if (negotiated & LPA_100FULL) ret = LPA_100FULL; else if (negotiated & LPA_100BASE4) ret = LPA_100BASE4; else if (negotiated & LPA_100HALF) ret = LPA_100HALF; else if (negotiated & LPA_10FULL) ret = LPA_10FULL; else ret = LPA_10HALF; return ret; } /** * mii_duplex * @duplex_lock: Non-zero if duplex is locked at full * @negotiated: value of MII ANAR and'd with ANLPAR * * A small helper function for a common case. Returns one * if the media is operating or locked at full duplex, and * returns zero otherwise. */ static inline unsigned int mii_duplex (unsigned int duplex_lock, unsigned int negotiated) { if (duplex_lock) return 1; if (mii_nway_result(negotiated) & LPA_DUPLEX) return 1; return 0; } /** * ethtool_adv_to_mii_adv_t * @ethadv: the ethtool advertisement settings * * A small helper function that translates ethtool advertisement * settings to phy autonegotiation advertisements for the * MII_ADVERTISE register. */ static inline u32 ethtool_adv_to_mii_adv_t(u32 ethadv) { u32 result = 0; if (ethadv & ADVERTISED_10baseT_Half) result |= ADVERTISE_10HALF; if (ethadv & ADVERTISED_10baseT_Full) result |= ADVERTISE_10FULL; if (ethadv & ADVERTISED_100baseT_Half) result |= ADVERTISE_100HALF; if (ethadv & ADVERTISED_100baseT_Full) result |= ADVERTISE_100FULL; if (ethadv & ADVERTISED_Pause) result |= ADVERTISE_PAUSE_CAP; if (ethadv & ADVERTISED_Asym_Pause) result |= ADVERTISE_PAUSE_ASYM; return result; } /** * linkmode_adv_to_mii_adv_t * @advertising: the linkmode advertisement settings * * A small helper function that translates linkmode advertisement * settings to phy autonegotiation advertisements for the * MII_ADVERTISE register. */ static inline u32 linkmode_adv_to_mii_adv_t(unsigned long *advertising) { u32 result = 0; if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, advertising)) result |= ADVERTISE_10HALF; if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, advertising)) result |= ADVERTISE_10FULL; if (linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, advertising)) result |= ADVERTISE_100HALF; if (linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, advertising)) result |= ADVERTISE_100FULL; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) result |= ADVERTISE_PAUSE_CAP; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) result |= ADVERTISE_PAUSE_ASYM; return result; } /** * mii_adv_to_ethtool_adv_t * @adv: value of the MII_ADVERTISE register * * A small helper function that translates MII_ADVERTISE bits * to ethtool advertisement settings. */ static inline u32 mii_adv_to_ethtool_adv_t(u32 adv) { u32 result = 0; if (adv & ADVERTISE_10HALF) result |= ADVERTISED_10baseT_Half; if (adv & ADVERTISE_10FULL) result |= ADVERTISED_10baseT_Full; if (adv & ADVERTISE_100HALF) result |= ADVERTISED_100baseT_Half; if (adv & ADVERTISE_100FULL) result |= ADVERTISED_100baseT_Full; if (adv & ADVERTISE_PAUSE_CAP) result |= ADVERTISED_Pause; if (adv & ADVERTISE_PAUSE_ASYM) result |= ADVERTISED_Asym_Pause; return result; } /** * ethtool_adv_to_mii_ctrl1000_t * @ethadv: the ethtool advertisement settings * * A small helper function that translates ethtool advertisement * settings to phy autonegotiation advertisements for the * MII_CTRL1000 register when in 1000T mode. */ static inline u32 ethtool_adv_to_mii_ctrl1000_t(u32 ethadv) { u32 result = 0; if (ethadv & ADVERTISED_1000baseT_Half) result |= ADVERTISE_1000HALF; if (ethadv & ADVERTISED_1000baseT_Full) result |= ADVERTISE_1000FULL; return result; } /** * linkmode_adv_to_mii_ctrl1000_t * @advertising: the linkmode advertisement settings * * A small helper function that translates linkmode advertisement * settings to phy autonegotiation advertisements for the * MII_CTRL1000 register when in 1000T mode. */ static inline u32 linkmode_adv_to_mii_ctrl1000_t(unsigned long *advertising) { u32 result = 0; if (linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising)) result |= ADVERTISE_1000HALF; if (linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising)) result |= ADVERTISE_1000FULL; return result; } /** * mii_ctrl1000_to_ethtool_adv_t * @adv: value of the MII_CTRL1000 register * * A small helper function that translates MII_CTRL1000 * bits, when in 1000Base-T mode, to ethtool * advertisement settings. */ static inline u32 mii_ctrl1000_to_ethtool_adv_t(u32 adv) { u32 result = 0; if (adv & ADVERTISE_1000HALF) result |= ADVERTISED_1000baseT_Half; if (adv & ADVERTISE_1000FULL) result |= ADVERTISED_1000baseT_Full; return result; } /** * mii_lpa_to_ethtool_lpa_t * @adv: value of the MII_LPA register * * A small helper function that translates MII_LPA * bits, when in 1000Base-T mode, to ethtool * LP advertisement settings. */ static inline u32 mii_lpa_to_ethtool_lpa_t(u32 lpa) { u32 result = 0; if (lpa & LPA_LPACK) result |= ADVERTISED_Autoneg; return result | mii_adv_to_ethtool_adv_t(lpa); } /** * mii_stat1000_to_ethtool_lpa_t * @adv: value of the MII_STAT1000 register * * A small helper function that translates MII_STAT1000 * bits, when in 1000Base-T mode, to ethtool * advertisement settings. */ static inline u32 mii_stat1000_to_ethtool_lpa_t(u32 lpa) { u32 result = 0; if (lpa & LPA_1000HALF) result |= ADVERTISED_1000baseT_Half; if (lpa & LPA_1000FULL) result |= ADVERTISED_1000baseT_Full; return result; } /** * mii_stat1000_mod_linkmode_lpa_t * @advertising: target the linkmode advertisement settings * @adv: value of the MII_STAT1000 register * * A small helper function that translates MII_STAT1000 bits, when in * 1000Base-T mode, to linkmode advertisement settings. Other bits in * advertising are not changes. */ static inline void mii_stat1000_mod_linkmode_lpa_t(unsigned long *advertising, u32 lpa) { linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising, lpa & LPA_1000HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising, lpa & LPA_1000FULL); } /** * ethtool_adv_to_mii_adv_x * @ethadv: the ethtool advertisement settings * * A small helper function that translates ethtool advertisement * settings to phy autonegotiation advertisements for the * MII_CTRL1000 register when in 1000Base-X mode. */ static inline u32 ethtool_adv_to_mii_adv_x(u32 ethadv) { u32 result = 0; if (ethadv & ADVERTISED_1000baseT_Half) result |= ADVERTISE_1000XHALF; if (ethadv & ADVERTISED_1000baseT_Full) result |= ADVERTISE_1000XFULL; if (ethadv & ADVERTISED_Pause) result |= ADVERTISE_1000XPAUSE; if (ethadv & ADVERTISED_Asym_Pause) result |= ADVERTISE_1000XPSE_ASYM; return result; } /** * mii_adv_to_ethtool_adv_x * @adv: value of the MII_CTRL1000 register * * A small helper function that translates MII_CTRL1000 * bits, when in 1000Base-X mode, to ethtool * advertisement settings. */ static inline u32 mii_adv_to_ethtool_adv_x(u32 adv) { u32 result = 0; if (adv & ADVERTISE_1000XHALF) result |= ADVERTISED_1000baseT_Half; if (adv & ADVERTISE_1000XFULL) result |= ADVERTISED_1000baseT_Full; if (adv & ADVERTISE_1000XPAUSE) result |= ADVERTISED_Pause; if (adv & ADVERTISE_1000XPSE_ASYM) result |= ADVERTISED_Asym_Pause; return result; } /** * mii_adv_mod_linkmode_adv_t * @advertising:pointer to destination link mode. * @adv: value of the MII_ADVERTISE register * * A small helper function that translates MII_ADVERTISE bits to * linkmode advertisement settings. Leaves other bits unchanged. */ static inline void mii_adv_mod_linkmode_adv_t(unsigned long *advertising, u32 adv) { linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, advertising, adv & ADVERTISE_10HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, advertising, adv & ADVERTISE_10FULL); linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, advertising, adv & ADVERTISE_100HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, advertising, adv & ADVERTISE_100FULL); linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising, adv & ADVERTISE_PAUSE_CAP); linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising, adv & ADVERTISE_PAUSE_ASYM); } /** * mii_adv_to_linkmode_adv_t * @advertising:pointer to destination link mode. * @adv: value of the MII_ADVERTISE register * * A small helper function that translates MII_ADVERTISE bits * to linkmode advertisement settings. Clears the old value * of advertising. */ static inline void mii_adv_to_linkmode_adv_t(unsigned long *advertising, u32 adv) { linkmode_zero(advertising); mii_adv_mod_linkmode_adv_t(advertising, adv); } /** * mii_lpa_to_linkmode_lpa_t * @adv: value of the MII_LPA register * * A small helper function that translates MII_LPA bits, when in * 1000Base-T mode, to linkmode LP advertisement settings. Clears the * old value of advertising */ static inline void mii_lpa_to_linkmode_lpa_t(unsigned long *lp_advertising, u32 lpa) { mii_adv_to_linkmode_adv_t(lp_advertising, lpa); if (lpa & LPA_LPACK) linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, lp_advertising); } /** * mii_lpa_mod_linkmode_lpa_t * @adv: value of the MII_LPA register * * A small helper function that translates MII_LPA bits, when in * 1000Base-T mode, to linkmode LP advertisement settings. Leaves * other bits unchanged. */ static inline void mii_lpa_mod_linkmode_lpa_t(unsigned long *lp_advertising, u32 lpa) { mii_adv_mod_linkmode_adv_t(lp_advertising, lpa); linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, lp_advertising, lpa & LPA_LPACK); } static inline void mii_ctrl1000_mod_linkmode_adv_t(unsigned long *advertising, u32 ctrl1000) { linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising, ctrl1000 & ADVERTISE_1000HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising, ctrl1000 & ADVERTISE_1000FULL); } /** * linkmode_adv_to_lcl_adv_t * @advertising:pointer to linkmode advertising * * A small helper function that translates linkmode advertising to LVL * pause capabilities. */ static inline u32 linkmode_adv_to_lcl_adv_t(unsigned long *advertising) { u32 lcl_adv = 0; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_CAP; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_ASYM; return lcl_adv; } /** * mii_lpa_mod_linkmode_x - decode the link partner's config_reg to linkmodes * @linkmodes: link modes array * @lpa: config_reg word from link partner * @fd_bit: link mode for 1000XFULL bit */ static inline void mii_lpa_mod_linkmode_x(unsigned long *linkmodes, u16 lpa, int fd_bit) { linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, linkmodes, lpa & LPA_LPACK); linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, linkmodes, lpa & LPA_1000XPAUSE); linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, linkmodes, lpa & LPA_1000XPAUSE_ASYM); linkmode_mod_bit(fd_bit, linkmodes, lpa & LPA_1000XFULL); } /** * linkmode_adv_to_mii_adv_x - encode a linkmode to config_reg * @linkmodes: linkmodes * @fd_bit: full duplex bit */ static inline u16 linkmode_adv_to_mii_adv_x(const unsigned long *linkmodes, int fd_bit) { u16 adv = 0; if (linkmode_test_bit(fd_bit, linkmodes)) adv |= ADVERTISE_1000XFULL; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, linkmodes)) adv |= ADVERTISE_1000XPAUSE; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, linkmodes)) adv |= ADVERTISE_1000XPSE_ASYM; return adv; } /** * mii_advertise_flowctrl - get flow control advertisement flags * @cap: Flow control capabilities (FLOW_CTRL_RX, FLOW_CTRL_TX or both) */ static inline u16 mii_advertise_flowctrl(int cap) { u16 adv = 0; if (cap & FLOW_CTRL_RX) adv = ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM; if (cap & FLOW_CTRL_TX) adv ^= ADVERTISE_PAUSE_ASYM; return adv; } /** * mii_resolve_flowctrl_fdx * @lcladv: value of MII ADVERTISE register * @rmtadv: value of MII LPA register * * Resolve full duplex flow control as per IEEE 802.3-2005 table 28B-3 */ static inline u8 mii_resolve_flowctrl_fdx(u16 lcladv, u16 rmtadv) { u8 cap = 0; if (lcladv & rmtadv & ADVERTISE_PAUSE_CAP) { cap = FLOW_CTRL_TX | FLOW_CTRL_RX; } else if (lcladv & rmtadv & ADVERTISE_PAUSE_ASYM) { if (lcladv & ADVERTISE_PAUSE_CAP) cap = FLOW_CTRL_RX; else if (rmtadv & ADVERTISE_PAUSE_CAP) cap = FLOW_CTRL_TX; } return cap; } /** * mii_bmcr_encode_fixed - encode fixed speed/duplex settings to a BMCR value * @speed: a SPEED_* value * @duplex: a DUPLEX_* value * * Encode the speed and duplex to a BMCR value. 2500, 1000, 100 and 10 Mbps are * supported. 2500Mbps is encoded to 1000Mbps. Other speeds are encoded as 10 * Mbps. Unknown duplex values are encoded to half-duplex. */ static inline u16 mii_bmcr_encode_fixed(int speed, int duplex) { u16 bmcr; switch (speed) { case SPEED_2500: case SPEED_1000: bmcr = BMCR_SPEED1000; break; case SPEED_100: bmcr = BMCR_SPEED100; break; case SPEED_10: default: bmcr = BMCR_SPEED10; break; } if (duplex == DUPLEX_FULL) bmcr |= BMCR_FULLDPLX; return bmcr; } #endif /* __LINUX_MII_H__ */ |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 | /* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ /* * aoenet.c * Ethernet portion of AoE driver */ #include <linux/gfp.h> #include <linux/hdreg.h> #include <linux/blkdev.h> #include <linux/netdevice.h> #include <linux/moduleparam.h> #include <net/net_namespace.h> #include <asm/unaligned.h> #include "aoe.h" #define NECODES 5 static char *aoe_errlist[] = { "no such error", "unrecognized command code", "bad argument parameter", "device unavailable", "config string present", "unsupported version" }; enum { IFLISTSZ = 1024, }; static char aoe_iflist[IFLISTSZ]; module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600); MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=dev1[,dev2...]"); static wait_queue_head_t txwq; static struct ktstate kts; #ifndef MODULE static int __init aoe_iflist_setup(char *str) { strncpy(aoe_iflist, str, IFLISTSZ); aoe_iflist[IFLISTSZ - 1] = '\0'; return 1; } __setup("aoe_iflist=", aoe_iflist_setup); #endif static spinlock_t txlock; static struct sk_buff_head skbtxq; /* enters with txlock held */ static int tx(int id) __must_hold(&txlock) { struct sk_buff *skb; struct net_device *ifp; while ((skb = skb_dequeue(&skbtxq))) { spin_unlock_irq(&txlock); ifp = skb->dev; if (dev_queue_xmit(skb) == NET_XMIT_DROP && net_ratelimit()) pr_warn("aoe: packet could not be sent on %s. %s\n", ifp ? ifp->name : "netif", "consider increasing tx_queue_len"); spin_lock_irq(&txlock); } return 0; } int is_aoe_netif(struct net_device *ifp) { register char *p, *q; register int len; if (aoe_iflist[0] == '\0') return 1; p = aoe_iflist + strspn(aoe_iflist, WHITESPACE); for (; *p; p = q + strspn(q, WHITESPACE)) { q = p + strcspn(p, WHITESPACE); if (q != p) len = q - p; else len = strlen(p); /* last token in aoe_iflist */ if (strlen(ifp->name) == len && !strncmp(ifp->name, p, len)) return 1; if (q == p) break; } return 0; } int set_aoe_iflist(const char __user *user_str, size_t size) { if (size >= IFLISTSZ) return -EINVAL; if (copy_from_user(aoe_iflist, user_str, size)) { printk(KERN_INFO "aoe: copy from user failed\n"); return -EFAULT; } aoe_iflist[size] = 0x00; return 0; } void aoenet_xmit(struct sk_buff_head *queue) { struct sk_buff *skb, *tmp; ulong flags; skb_queue_walk_safe(queue, skb, tmp) { __skb_unlink(skb, queue); spin_lock_irqsave(&txlock, flags); skb_queue_tail(&skbtxq, skb); spin_unlock_irqrestore(&txlock, flags); wake_up(&txwq); } } /* * (1) len doesn't include the header by default. I want this. */ static int aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev) { struct aoe_hdr *h; struct aoe_atahdr *ah; u32 n; int sn; if (dev_net(ifp) != &init_net) goto exit; skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) return 0; if (!is_aoe_netif(ifp)) goto exit; skb_push(skb, ETH_HLEN); /* (1) */ sn = sizeof(*h) + sizeof(*ah); if (skb->len >= sn) { sn -= skb_headlen(skb); if (sn > 0 && !__pskb_pull_tail(skb, sn)) goto exit; } h = (struct aoe_hdr *) skb->data; n = get_unaligned_be32(&h->tag); if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31)) goto exit; if (h->verfl & AOEFL_ERR) { n = h->err; if (n > NECODES) n = 0; if (net_ratelimit()) printk(KERN_ERR "%s%d.%d@%s; ecode=%d '%s'\n", "aoe: error packet from ", get_unaligned_be16(&h->major), h->minor, skb->dev->name, h->err, aoe_errlist[n]); goto exit; } switch (h->cmd) { case AOECMD_ATA: /* ata_rsp may keep skb for later processing or give it back */ skb = aoecmd_ata_rsp(skb); break; case AOECMD_CFG: aoecmd_cfg_rsp(skb); break; default: if (h->cmd >= AOECMD_VEND_MIN) break; /* don't complain about vendor commands */ pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd); break; } if (!skb) return 0; exit: dev_kfree_skb(skb); return 0; } static struct packet_type aoe_pt __read_mostly = { .type = __constant_htons(ETH_P_AOE), .func = aoenet_rcv, }; int __init aoenet_init(void) { skb_queue_head_init(&skbtxq); init_waitqueue_head(&txwq); spin_lock_init(&txlock); kts.lock = &txlock; kts.fn = tx; kts.waitq = &txwq; kts.id = 0; snprintf(kts.name, sizeof(kts.name), "aoe_tx%d", kts.id); if (aoe_ktstart(&kts)) return -EAGAIN; dev_add_pack(&aoe_pt); return 0; } void aoenet_exit(void) { aoe_ktstop(&kts); skb_queue_purge(&skbtxq); dev_remove_pack(&aoe_pt); } |
1 13 1 12 1 11 4 6 1 10 9 1 13 7 8 8 8 7 8 3 17 17 14 13 8 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 | // SPDX-License-Identifier: GPL-2.0-only /* xfrm_user.c: User interface to configure xfrm engine. * * Copyright (C) 2002 David S. Miller (davem@redhat.com) * * Changes: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * */ #include <linux/compat.h> #include <linux/crypto.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/string.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/init.h> #include <linux/security.h> #include <net/sock.h> #include <net/xfrm.h> #include <net/netlink.h> #include <net/ah.h> #include <linux/uaccess.h> #if IS_ENABLED(CONFIG_IPV6) #include <linux/in6.h> #endif #include <asm/unaligned.h> static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type, struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[type]; struct xfrm_algo *algp; if (!rt) return 0; algp = nla_data(rt); if (nla_len(rt) < (int)xfrm_alg_len(algp)) { NL_SET_ERR_MSG(extack, "Invalid AUTH/CRYPT/COMP attribute length"); return -EINVAL; } switch (type) { case XFRMA_ALG_AUTH: case XFRMA_ALG_CRYPT: case XFRMA_ALG_COMP: break; default: NL_SET_ERR_MSG(extack, "Invalid algorithm attribute type"); return -EINVAL; } algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; return 0; } static int verify_auth_trunc(struct nlattr **attrs, struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_ALG_AUTH_TRUNC]; struct xfrm_algo_auth *algp; if (!rt) return 0; algp = nla_data(rt); if (nla_len(rt) < (int)xfrm_alg_auth_len(algp)) { NL_SET_ERR_MSG(extack, "Invalid AUTH_TRUNC attribute length"); return -EINVAL; } algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; return 0; } static int verify_aead(struct nlattr **attrs, struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_ALG_AEAD]; struct xfrm_algo_aead *algp; if (!rt) return 0; algp = nla_data(rt); if (nla_len(rt) < (int)aead_len(algp)) { NL_SET_ERR_MSG(extack, "Invalid AEAD attribute length"); return -EINVAL; } algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; return 0; } static void verify_one_addr(struct nlattr **attrs, enum xfrm_attr_type_t type, xfrm_address_t **addrp) { struct nlattr *rt = attrs[type]; if (rt && addrp) *addrp = nla_data(rt); } static inline int verify_sec_ctx_len(struct nlattr **attrs, struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_SEC_CTX]; struct xfrm_user_sec_ctx *uctx; if (!rt) return 0; uctx = nla_data(rt); if (uctx->len > nla_len(rt) || uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len)) { NL_SET_ERR_MSG(extack, "Invalid security context length"); return -EINVAL; } return 0; } static inline int verify_replay(struct xfrm_usersa_info *p, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL]; struct xfrm_replay_state_esn *rs; if (!rt) { if (p->flags & XFRM_STATE_ESN) { NL_SET_ERR_MSG(extack, "Missing required attribute for ESN"); return -EINVAL; } return 0; } rs = nla_data(rt); if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8) { NL_SET_ERR_MSG(extack, "ESN bitmap length must be <= 128"); return -EINVAL; } if (nla_len(rt) < (int)xfrm_replay_state_esn_len(rs) && nla_len(rt) != sizeof(*rs)) { NL_SET_ERR_MSG(extack, "ESN attribute is too short to fit the full bitmap length"); return -EINVAL; } /* As only ESP and AH support ESN feature. */ if ((p->id.proto != IPPROTO_ESP) && (p->id.proto != IPPROTO_AH)) { NL_SET_ERR_MSG(extack, "ESN only supported for ESP and AH"); return -EINVAL; } if (p->replay_window != 0) { NL_SET_ERR_MSG(extack, "ESN not compatible with legacy replay_window"); return -EINVAL; } return 0; } static int verify_newsa_info(struct xfrm_usersa_info *p, struct nlattr **attrs, struct netlink_ext_ack *extack) { int err; err = -EINVAL; switch (p->family) { case AF_INET: break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) break; #else err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "IPv6 support disabled"); goto out; #endif default: NL_SET_ERR_MSG(extack, "Invalid address family"); goto out; } switch (p->sel.family) { case AF_UNSPEC: break; case AF_INET: if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) { NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 32 for IPv4)"); goto out; } break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 128 for IPv6)"); goto out; } break; #else NL_SET_ERR_MSG(extack, "IPv6 support disabled"); err = -EAFNOSUPPORT; goto out; #endif default: NL_SET_ERR_MSG(extack, "Invalid address family in selector"); goto out; } err = -EINVAL; switch (p->id.proto) { case IPPROTO_AH: if (!attrs[XFRMA_ALG_AUTH] && !attrs[XFRMA_ALG_AUTH_TRUNC]) { NL_SET_ERR_MSG(extack, "Missing required attribute for AH: AUTH_TRUNC or AUTH"); goto out; } if (attrs[XFRMA_ALG_AEAD] || attrs[XFRMA_ALG_CRYPT] || attrs[XFRMA_ALG_COMP] || attrs[XFRMA_TFCPAD]) { NL_SET_ERR_MSG(extack, "Invalid attributes for AH: AEAD, CRYPT, COMP, TFCPAD"); goto out; } break; case IPPROTO_ESP: if (attrs[XFRMA_ALG_COMP]) { NL_SET_ERR_MSG(extack, "Invalid attribute for ESP: COMP"); goto out; } if (!attrs[XFRMA_ALG_AUTH] && !attrs[XFRMA_ALG_AUTH_TRUNC] && !attrs[XFRMA_ALG_CRYPT] && !attrs[XFRMA_ALG_AEAD]) { NL_SET_ERR_MSG(extack, "Missing required attribute for ESP: at least one of AUTH, AUTH_TRUNC, CRYPT, AEAD"); goto out; } if ((attrs[XFRMA_ALG_AUTH] || attrs[XFRMA_ALG_AUTH_TRUNC] || attrs[XFRMA_ALG_CRYPT]) && attrs[XFRMA_ALG_AEAD]) { NL_SET_ERR_MSG(extack, "Invalid attribute combination for ESP: AEAD can't be used with AUTH, AUTH_TRUNC, CRYPT"); goto out; } if (attrs[XFRMA_TFCPAD] && p->mode != XFRM_MODE_TUNNEL) { NL_SET_ERR_MSG(extack, "TFC padding can only be used in tunnel mode"); goto out; } break; case IPPROTO_COMP: if (!attrs[XFRMA_ALG_COMP]) { NL_SET_ERR_MSG(extack, "Missing required attribute for COMP: COMP"); goto out; } if (attrs[XFRMA_ALG_AEAD] || attrs[XFRMA_ALG_AUTH] || attrs[XFRMA_ALG_AUTH_TRUNC] || attrs[XFRMA_ALG_CRYPT] || attrs[XFRMA_TFCPAD]) { NL_SET_ERR_MSG(extack, "Invalid attributes for COMP: AEAD, AUTH, AUTH_TRUNC, CRYPT, TFCPAD"); goto out; } if (ntohl(p->id.spi) >= 0x10000) { NL_SET_ERR_MSG(extack, "SPI is too large for COMP (must be < 0x10000)"); goto out; } break; #if IS_ENABLED(CONFIG_IPV6) case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: if (attrs[XFRMA_ALG_COMP] || attrs[XFRMA_ALG_AUTH] || attrs[XFRMA_ALG_AUTH_TRUNC] || attrs[XFRMA_ALG_AEAD] || attrs[XFRMA_ALG_CRYPT] || attrs[XFRMA_ENCAP] || attrs[XFRMA_SEC_CTX] || attrs[XFRMA_TFCPAD]) { NL_SET_ERR_MSG(extack, "Invalid attributes for DSTOPTS/ROUTING"); goto out; } if (!attrs[XFRMA_COADDR]) { NL_SET_ERR_MSG(extack, "Missing required COADDR attribute for DSTOPTS/ROUTING"); goto out; } break; #endif default: NL_SET_ERR_MSG(extack, "Unsupported protocol"); goto out; } if ((err = verify_aead(attrs, extack))) goto out; if ((err = verify_auth_trunc(attrs, extack))) goto out; if ((err = verify_one_alg(attrs, XFRMA_ALG_AUTH, extack))) goto out; if ((err = verify_one_alg(attrs, XFRMA_ALG_CRYPT, extack))) goto out; if ((err = verify_one_alg(attrs, XFRMA_ALG_COMP, extack))) goto out; if ((err = verify_sec_ctx_len(attrs, extack))) goto out; if ((err = verify_replay(p, attrs, extack))) goto out; err = -EINVAL; switch (p->mode) { case XFRM_MODE_TRANSPORT: case XFRM_MODE_TUNNEL: case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_BEET: break; default: NL_SET_ERR_MSG(extack, "Unsupported mode"); goto out; } err = 0; if (attrs[XFRMA_MTIMER_THRESH]) { if (!attrs[XFRMA_ENCAP]) { NL_SET_ERR_MSG(extack, "MTIMER_THRESH attribute can only be set on ENCAP states"); err = -EINVAL; goto out; } } out: return err; } static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, struct xfrm_algo_desc *(*get_byname)(const char *, int), struct nlattr *rta, struct netlink_ext_ack *extack) { struct xfrm_algo *p, *ualg; struct xfrm_algo_desc *algo; if (!rta) return 0; ualg = nla_data(rta); algo = get_byname(ualg->alg_name, 1); if (!algo) { NL_SET_ERR_MSG(extack, "Requested COMP algorithm not found"); return -ENOSYS; } *props = algo->desc.sadb_alg_id; p = kmemdup(ualg, xfrm_alg_len(ualg), GFP_KERNEL); if (!p) return -ENOMEM; strcpy(p->alg_name, algo->name); *algpp = p; return 0; } static int attach_crypt(struct xfrm_state *x, struct nlattr *rta, struct netlink_ext_ack *extack) { struct xfrm_algo *p, *ualg; struct xfrm_algo_desc *algo; if (!rta) return 0; ualg = nla_data(rta); algo = xfrm_ealg_get_byname(ualg->alg_name, 1); if (!algo) { NL_SET_ERR_MSG(extack, "Requested CRYPT algorithm not found"); return -ENOSYS; } x->props.ealgo = algo->desc.sadb_alg_id; p = kmemdup(ualg, xfrm_alg_len(ualg), GFP_KERNEL); if (!p) return -ENOMEM; strcpy(p->alg_name, algo->name); x->ealg = p; x->geniv = algo->uinfo.encr.geniv; return 0; } static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props, struct nlattr *rta, struct netlink_ext_ack *extack) { struct xfrm_algo *ualg; struct xfrm_algo_auth *p; struct xfrm_algo_desc *algo; if (!rta) return 0; ualg = nla_data(rta); algo = xfrm_aalg_get_byname(ualg->alg_name, 1); if (!algo) { NL_SET_ERR_MSG(extack, "Requested AUTH algorithm not found"); return -ENOSYS; } *props = algo->desc.sadb_alg_id; p = kmalloc(sizeof(*p) + (ualg->alg_key_len + 7) / 8, GFP_KERNEL); if (!p) return -ENOMEM; strcpy(p->alg_name, algo->name); p->alg_key_len = ualg->alg_key_len; p->alg_trunc_len = algo->uinfo.auth.icv_truncbits; memcpy(p->alg_key, ualg->alg_key, (ualg->alg_key_len + 7) / 8); *algpp = p; return 0; } static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props, struct nlattr *rta, struct netlink_ext_ack *extack) { struct xfrm_algo_auth *p, *ualg; struct xfrm_algo_desc *algo; if (!rta) return 0; ualg = nla_data(rta); algo = xfrm_aalg_get_byname(ualg->alg_name, 1); if (!algo) { NL_SET_ERR_MSG(extack, "Requested AUTH_TRUNC algorithm not found"); return -ENOSYS; } if (ualg->alg_trunc_len > algo->uinfo.auth.icv_fullbits) { NL_SET_ERR_MSG(extack, "Invalid length requested for truncated ICV"); return -EINVAL; } *props = algo->desc.sadb_alg_id; p = kmemdup(ualg, xfrm_alg_auth_len(ualg), GFP_KERNEL); if (!p) return -ENOMEM; strcpy(p->alg_name, algo->name); if (!p->alg_trunc_len) p->alg_trunc_len = algo->uinfo.auth.icv_truncbits; *algpp = p; return 0; } static int attach_aead(struct xfrm_state *x, struct nlattr *rta, struct netlink_ext_ack *extack) { struct xfrm_algo_aead *p, *ualg; struct xfrm_algo_desc *algo; if (!rta) return 0; ualg = nla_data(rta); algo = xfrm_aead_get_byname(ualg->alg_name, ualg->alg_icv_len, 1); if (!algo) { NL_SET_ERR_MSG(extack, "Requested AEAD algorithm not found"); return -ENOSYS; } x->props.ealgo = algo->desc.sadb_alg_id; p = kmemdup(ualg, aead_len(ualg), GFP_KERNEL); if (!p) return -ENOMEM; strcpy(p->alg_name, algo->name); x->aead = p; x->geniv = algo->uinfo.aead.geniv; return 0; } static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_esn, struct nlattr *rp, struct netlink_ext_ack *extack) { struct xfrm_replay_state_esn *up; unsigned int ulen; if (!replay_esn || !rp) return 0; up = nla_data(rp); ulen = xfrm_replay_state_esn_len(up); /* Check the overall length and the internal bitmap length to avoid * potential overflow. */ if (nla_len(rp) < (int)ulen) { NL_SET_ERR_MSG(extack, "ESN attribute is too short"); return -EINVAL; } if (xfrm_replay_state_esn_len(replay_esn) != ulen) { NL_SET_ERR_MSG(extack, "New ESN size doesn't match the existing SA's ESN size"); return -EINVAL; } if (replay_esn->bmp_len != up->bmp_len) { NL_SET_ERR_MSG(extack, "New ESN bitmap size doesn't match the existing SA's ESN bitmap"); return -EINVAL; } if (up->replay_window > up->bmp_len * sizeof(__u32) * 8) { NL_SET_ERR_MSG(extack, "ESN replay window is longer than the bitmap"); return -EINVAL; } return 0; } static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn, struct xfrm_replay_state_esn **preplay_esn, struct nlattr *rta) { struct xfrm_replay_state_esn *p, *pp, *up; unsigned int klen, ulen; if (!rta) return 0; up = nla_data(rta); klen = xfrm_replay_state_esn_len(up); ulen = nla_len(rta) >= (int)klen ? klen : sizeof(*up); p = kzalloc(klen, GFP_KERNEL); if (!p) return -ENOMEM; pp = kzalloc(klen, GFP_KERNEL); if (!pp) { kfree(p); return -ENOMEM; } memcpy(p, up, ulen); memcpy(pp, up, ulen); *replay_esn = p; *preplay_esn = pp; return 0; } static inline unsigned int xfrm_user_sec_ctx_size(struct xfrm_sec_ctx *xfrm_ctx) { unsigned int len = 0; if (xfrm_ctx) { len += sizeof(struct xfrm_user_sec_ctx); len += xfrm_ctx->ctx_len; } return len; } static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) { memcpy(&x->id, &p->id, sizeof(x->id)); memcpy(&x->sel, &p->sel, sizeof(x->sel)); memcpy(&x->lft, &p->lft, sizeof(x->lft)); x->props.mode = p->mode; x->props.replay_window = min_t(unsigned int, p->replay_window, sizeof(x->replay.bitmap) * 8); x->props.reqid = p->reqid; x->props.family = p->family; memcpy(&x->props.saddr, &p->saddr, sizeof(x->props.saddr)); x->props.flags = p->flags; if (!x->sel.family && !(p->flags & XFRM_STATE_AF_UNSPEC)) x->sel.family = p->family; } /* * someday when pfkey also has support, we could have the code * somehow made shareable and move it to xfrm_state.c - JHS * */ static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs, int update_esn) { struct nlattr *rp = attrs[XFRMA_REPLAY_VAL]; struct nlattr *re = update_esn ? attrs[XFRMA_REPLAY_ESN_VAL] : NULL; struct nlattr *lt = attrs[XFRMA_LTIME_VAL]; struct nlattr *et = attrs[XFRMA_ETIMER_THRESH]; struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH]; struct nlattr *mt = attrs[XFRMA_MTIMER_THRESH]; if (re && x->replay_esn && x->preplay_esn) { struct xfrm_replay_state_esn *replay_esn; replay_esn = nla_data(re); memcpy(x->replay_esn, replay_esn, xfrm_replay_state_esn_len(replay_esn)); memcpy(x->preplay_esn, replay_esn, xfrm_replay_state_esn_len(replay_esn)); } if (rp) { struct xfrm_replay_state *replay; replay = nla_data(rp); memcpy(&x->replay, replay, sizeof(*replay)); memcpy(&x->preplay, replay, sizeof(*replay)); } if (lt) { struct xfrm_lifetime_cur *ltime; ltime = nla_data(lt); x->curlft.bytes = ltime->bytes; x->curlft.packets = ltime->packets; x->curlft.add_time = ltime->add_time; x->curlft.use_time = ltime->use_time; } if (et) x->replay_maxage = nla_get_u32(et); if (rt) x->replay_maxdiff = nla_get_u32(rt); if (mt) x->mapping_maxage = nla_get_u32(mt); } static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m) { if (attrs[XFRMA_SET_MARK]) { m->v = nla_get_u32(attrs[XFRMA_SET_MARK]); if (attrs[XFRMA_SET_MARK_MASK]) m->m = nla_get_u32(attrs[XFRMA_SET_MARK_MASK]); else m->m = 0xffffffff; } else { m->v = m->m = 0; } } static struct xfrm_state *xfrm_state_construct(struct net *net, struct xfrm_usersa_info *p, struct nlattr **attrs, int *errp, struct netlink_ext_ack *extack) { struct xfrm_state *x = xfrm_state_alloc(net); int err = -ENOMEM; if (!x) goto error_no_put; copy_from_user_state(x, p); if (attrs[XFRMA_ENCAP]) { x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]), sizeof(*x->encap), GFP_KERNEL); if (x->encap == NULL) goto error; } if (attrs[XFRMA_COADDR]) { x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]), sizeof(*x->coaddr), GFP_KERNEL); if (x->coaddr == NULL) goto error; } if (attrs[XFRMA_SA_EXTRA_FLAGS]) x->props.extra_flags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]); if ((err = attach_aead(x, attrs[XFRMA_ALG_AEAD], extack))) goto error; if ((err = attach_auth_trunc(&x->aalg, &x->props.aalgo, attrs[XFRMA_ALG_AUTH_TRUNC], extack))) goto error; if (!x->props.aalgo) { if ((err = attach_auth(&x->aalg, &x->props.aalgo, attrs[XFRMA_ALG_AUTH], extack))) goto error; } if ((err = attach_crypt(x, attrs[XFRMA_ALG_CRYPT], extack))) goto error; if ((err = attach_one_algo(&x->calg, &x->props.calgo, xfrm_calg_get_byname, attrs[XFRMA_ALG_COMP], extack))) goto error; if (attrs[XFRMA_TFCPAD]) x->tfcpad = nla_get_u32(attrs[XFRMA_TFCPAD]); xfrm_mark_get(attrs, &x->mark); xfrm_smark_init(attrs, &x->props.smark); if (attrs[XFRMA_IF_ID]) x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack); if (err) goto error; if (attrs[XFRMA_SEC_CTX]) { err = security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX])); if (err) goto error; } if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) goto error; x->km.seq = p->seq; x->replay_maxdiff = net->xfrm.sysctl_aevent_rseqth; /* sysctl_xfrm_aevent_etime is in 100ms units */ x->replay_maxage = (net->xfrm.sysctl_aevent_etime*HZ)/XFRM_AE_ETH_M; if ((err = xfrm_init_replay(x, extack))) goto error; /* override default values from above */ xfrm_update_ae_params(x, attrs, 0); /* configure the hardware if offload is requested */ if (attrs[XFRMA_OFFLOAD_DEV]) { err = xfrm_dev_state_add(net, x, nla_data(attrs[XFRMA_OFFLOAD_DEV]), extack); if (err) goto error; } return x; error: x->km.state = XFRM_STATE_DEAD; xfrm_state_put(x); error_no_put: *errp = err; return NULL; } static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_usersa_info *p = nlmsg_data(nlh); struct xfrm_state *x; int err; struct km_event c; err = verify_newsa_info(p, attrs, extack); if (err) return err; x = xfrm_state_construct(net, p, attrs, &err, extack); if (!x) return err; xfrm_state_hold(x); if (nlh->nlmsg_type == XFRM_MSG_NEWSA) err = xfrm_state_add(x); else err = xfrm_state_update(x); xfrm_audit_state_add(x, err ? 0 : 1, true); if (err < 0) { x->km.state = XFRM_STATE_DEAD; xfrm_dev_state_delete(x); __xfrm_state_put(x); goto out; } if (x->km.state == XFRM_STATE_VOID) x->km.state = XFRM_STATE_VALID; c.seq = nlh->nlmsg_seq; c.portid = nlh->nlmsg_pid; c.event = nlh->nlmsg_type; km_state_notify(x, &c); out: xfrm_state_put(x); return err; } static struct xfrm_state *xfrm_user_state_lookup(struct net *net, struct xfrm_usersa_id *p, struct nlattr **attrs, int *errp) { struct xfrm_state *x = NULL; struct xfrm_mark m; int err; u32 mark = xfrm_mark_get(attrs, &m); if (xfrm_id_proto_match(p->proto, IPSEC_PROTO_ANY)) { err = -ESRCH; x = xfrm_state_lookup(net, mark, &p->daddr, p->spi, p->proto, p->family); } else { xfrm_address_t *saddr = NULL; verify_one_addr(attrs, XFRMA_SRCADDR, &saddr); if (!saddr) { err = -EINVAL; goto out; } err = -ESRCH; x = xfrm_state_lookup_byaddr(net, mark, &p->daddr, saddr, p->proto, p->family); } out: if (!x && errp) *errp = err; return x; } static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_state *x; int err = -ESRCH; struct km_event c; struct xfrm_usersa_id *p = nlmsg_data(nlh); x = xfrm_user_state_lookup(net, p, attrs, &err); if (x == NULL) return err; if ((err = security_xfrm_state_delete(x)) != 0) goto out; if (xfrm_state_kern(x)) { NL_SET_ERR_MSG(extack, "SA is in use by tunnels"); err = -EPERM; goto out; } err = xfrm_state_delete(x); if (err < 0) goto out; c.seq = nlh->nlmsg_seq; c.portid = nlh->nlmsg_pid; c.event = nlh->nlmsg_type; km_state_notify(x, &c); out: xfrm_audit_state_delete(x, err ? 0 : 1, true); xfrm_state_put(x); return err; } static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) { memset(p, 0, sizeof(*p)); memcpy(&p->id, &x->id, sizeof(p->id)); memcpy(&p->sel, &x->sel, sizeof(p->sel)); memcpy(&p->lft, &x->lft, sizeof(p->lft)); if (x->xso.dev) xfrm_dev_state_update_curlft(x); memcpy(&p->curlft, &x->curlft, sizeof(p->curlft)); put_unaligned(x->stats.replay_window, &p->stats.replay_window); put_unaligned(x->stats.replay, &p->stats.replay); put_unaligned(x->stats.integrity_failed, &p->stats.integrity_failed); memcpy(&p->saddr, &x->props.saddr, sizeof(p->saddr)); p->mode = x->props.mode; p->replay_window = x->props.replay_window; p->reqid = x->props.reqid; p->family = x->props.family; p->flags = x->props.flags; p->seq = x->km.seq; } struct xfrm_dump_info { struct sk_buff *in_skb; struct sk_buff *out_skb; u32 nlmsg_seq; u16 nlmsg_flags; }; static int copy_sec_ctx(struct xfrm_sec_ctx *s, struct sk_buff *skb) { struct xfrm_user_sec_ctx *uctx; struct nlattr *attr; int ctx_size = sizeof(*uctx) + s->ctx_len; attr = nla_reserve(skb, XFRMA_SEC_CTX, ctx_size); if (attr == NULL) return -EMSGSIZE; uctx = nla_data(attr); uctx->exttype = XFRMA_SEC_CTX; uctx->len = ctx_size; uctx->ctx_doi = s->ctx_doi; uctx->ctx_alg = s->ctx_alg; uctx->ctx_len = s->ctx_len; memcpy(uctx + 1, s->ctx_str, s->ctx_len); return 0; } static int copy_user_offload(struct xfrm_dev_offload *xso, struct sk_buff *skb) { struct xfrm_user_offload *xuo; struct nlattr *attr; attr = nla_reserve(skb, XFRMA_OFFLOAD_DEV, sizeof(*xuo)); if (attr == NULL) return -EMSGSIZE; xuo = nla_data(attr); memset(xuo, 0, sizeof(*xuo)); xuo->ifindex = xso->dev->ifindex; if (xso->dir == XFRM_DEV_OFFLOAD_IN) xuo->flags = XFRM_OFFLOAD_INBOUND; if (xso->type == XFRM_DEV_OFFLOAD_PACKET) xuo->flags |= XFRM_OFFLOAD_PACKET; return 0; } static bool xfrm_redact(void) { return IS_ENABLED(CONFIG_SECURITY) && security_locked_down(LOCKDOWN_XFRM_SECRET); } static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) { struct xfrm_algo *algo; struct xfrm_algo_auth *ap; struct nlattr *nla; bool redact_secret = xfrm_redact(); nla = nla_reserve(skb, XFRMA_ALG_AUTH, sizeof(*algo) + (auth->alg_key_len + 7) / 8); if (!nla) return -EMSGSIZE; algo = nla_data(nla); strncpy(algo->alg_name, auth->alg_name, sizeof(algo->alg_name)); if (redact_secret && auth->alg_key_len) memset(algo->alg_key, 0, (auth->alg_key_len + 7) / 8); else memcpy(algo->alg_key, auth->alg_key, (auth->alg_key_len + 7) / 8); algo->alg_key_len = auth->alg_key_len; nla = nla_reserve(skb, XFRMA_ALG_AUTH_TRUNC, xfrm_alg_auth_len(auth)); if (!nla) return -EMSGSIZE; ap = nla_data(nla); memcpy(ap, auth, sizeof(struct xfrm_algo_auth)); if (redact_secret && auth->alg_key_len) memset(ap->alg_key, 0, (auth->alg_key_len + 7) / 8); else memcpy(ap->alg_key, auth->alg_key, (auth->alg_key_len + 7) / 8); return 0; } static int copy_to_user_aead(struct xfrm_algo_aead *aead, struct sk_buff *skb) { struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_AEAD, aead_len(aead)); struct xfrm_algo_aead *ap; bool redact_secret = xfrm_redact(); if (!nla) return -EMSGSIZE; ap = nla_data(nla); strscpy_pad(ap->alg_name, aead->alg_name, sizeof(ap->alg_name)); ap->alg_key_len = aead->alg_key_len; ap->alg_icv_len = aead->alg_icv_len; if (redact_secret && aead->alg_key_len) memset(ap->alg_key, 0, (aead->alg_key_len + 7) / 8); else memcpy(ap->alg_key, aead->alg_key, (aead->alg_key_len + 7) / 8); return 0; } static int copy_to_user_ealg(struct xfrm_algo *ealg, struct sk_buff *skb) { struct xfrm_algo *ap; bool redact_secret = xfrm_redact(); struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_CRYPT, xfrm_alg_len(ealg)); if (!nla) return -EMSGSIZE; ap = nla_data(nla); strscpy_pad(ap->alg_name, ealg->alg_name, sizeof(ap->alg_name)); ap->alg_key_len = ealg->alg_key_len; if (redact_secret && ealg->alg_key_len) memset(ap->alg_key, 0, (ealg->alg_key_len + 7) / 8); else memcpy(ap->alg_key, ealg->alg_key, (ealg->alg_key_len + 7) / 8); return 0; } static int copy_to_user_calg(struct xfrm_algo *calg, struct sk_buff *skb) { struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_COMP, sizeof(*calg)); struct xfrm_algo *ap; if (!nla) return -EMSGSIZE; ap = nla_data(nla); strscpy_pad(ap->alg_name, calg->alg_name, sizeof(ap->alg_name)); ap->alg_key_len = 0; return 0; } static int copy_to_user_encap(struct xfrm_encap_tmpl *ep, struct sk_buff *skb) { struct nlattr *nla = nla_reserve(skb, XFRMA_ENCAP, sizeof(*ep)); struct xfrm_encap_tmpl *uep; if (!nla) return -EMSGSIZE; uep = nla_data(nla); memset(uep, 0, sizeof(*uep)); uep->encap_type = ep->encap_type; uep->encap_sport = ep->encap_sport; uep->encap_dport = ep->encap_dport; uep->encap_oa = ep->encap_oa; return 0; } static int xfrm_smark_put(struct sk_buff *skb, struct xfrm_mark *m) { int ret = 0; if (m->v | m->m) { ret = nla_put_u32(skb, XFRMA_SET_MARK, m->v); if (!ret) ret = nla_put_u32(skb, XFRMA_SET_MARK_MASK, m->m); } return ret; } /* Don't change this without updating xfrm_sa_len! */ static int copy_to_user_state_extra(struct xfrm_state *x, struct xfrm_usersa_info *p, struct sk_buff *skb) { int ret = 0; copy_to_user_state(x, p); if (x->props.extra_flags) { ret = nla_put_u32(skb, XFRMA_SA_EXTRA_FLAGS, x->props.extra_flags); if (ret) goto out; } if (x->coaddr) { ret = nla_put(skb, XFRMA_COADDR, sizeof(*x->coaddr), x->coaddr); if (ret) goto out; } if (x->lastused) { ret = nla_put_u64_64bit(skb, XFRMA_LASTUSED, x->lastused, XFRMA_PAD); if (ret) goto out; } if (x->aead) { ret = copy_to_user_aead(x->aead, skb); if (ret) goto out; } if (x->aalg) { ret = copy_to_user_auth(x->aalg, skb); if (ret) goto out; } if (x->ealg) { ret = copy_to_user_ealg(x->ealg, skb); if (ret) goto out; } if (x->calg) { ret = copy_to_user_calg(x->calg, skb); if (ret) goto out; } if (x->encap) { ret = copy_to_user_encap(x->encap, skb); if (ret) goto out; } if (x->tfcpad) { ret = nla_put_u32(skb, XFRMA_TFCPAD, x->tfcpad); if (ret) goto out; } ret = xfrm_mark_put(skb, &x->mark); if (ret) goto out; ret = xfrm_smark_put(skb, &x->props.smark); if (ret) goto out; if (x->replay_esn) ret = nla_put(skb, XFRMA_REPLAY_ESN_VAL, xfrm_replay_state_esn_len(x->replay_esn), x->replay_esn); else ret = nla_put(skb, XFRMA_REPLAY_VAL, sizeof(x->replay), &x->replay); if (ret) goto out; if(x->xso.dev) ret = copy_user_offload(&x->xso, skb); if (ret) goto out; if (x->if_id) { ret = nla_put_u32(skb, XFRMA_IF_ID, x->if_id); if (ret) goto out; } if (x->security) { ret = copy_sec_ctx(x->security, skb); if (ret) goto out; } if (x->mapping_maxage) ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage); out: return ret; } static int dump_one_state(struct xfrm_state *x, int count, void *ptr) { struct xfrm_dump_info *sp = ptr; struct sk_buff *in_skb = sp->in_skb; struct sk_buff *skb = sp->out_skb; struct xfrm_translator *xtr; struct xfrm_usersa_info *p; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, NETLINK_CB(in_skb).portid, sp->nlmsg_seq, XFRM_MSG_NEWSA, sizeof(*p), sp->nlmsg_flags); if (nlh == NULL) return -EMSGSIZE; p = nlmsg_data(nlh); err = copy_to_user_state_extra(x, p, skb); if (err) { nlmsg_cancel(skb, nlh); return err; } nlmsg_end(skb, nlh); xtr = xfrm_get_translator(); if (xtr) { err = xtr->alloc_compat(skb, nlh); xfrm_put_translator(xtr); if (err) { nlmsg_cancel(skb, nlh); return err; } } return 0; } static int xfrm_dump_sa_done(struct netlink_callback *cb) { struct xfrm_state_walk *walk = (struct xfrm_state_walk *) &cb->args[1]; struct sock *sk = cb->skb->sk; struct net *net = sock_net(sk); if (cb->args[0]) xfrm_state_walk_done(walk, net); return 0; } static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct xfrm_state_walk *walk = (struct xfrm_state_walk *) &cb->args[1]; struct xfrm_dump_info info; BUILD_BUG_ON(sizeof(struct xfrm_state_walk) > sizeof(cb->args) - sizeof(cb->args[0])); info.in_skb = cb->skb; info.out_skb = skb; info.nlmsg_seq = cb->nlh->nlmsg_seq; info.nlmsg_flags = NLM_F_MULTI; if (!cb->args[0]) { struct nlattr *attrs[XFRMA_MAX+1]; struct xfrm_address_filter *filter = NULL; u8 proto = 0; int err; err = nlmsg_parse_deprecated(cb->nlh, 0, attrs, XFRMA_MAX, xfrma_policy, cb->extack); if (err < 0) return err; if (attrs[XFRMA_ADDRESS_FILTER]) { filter = kmemdup(nla_data(attrs[XFRMA_ADDRESS_FILTER]), sizeof(*filter), GFP_KERNEL); if (filter == NULL) return -ENOMEM; /* see addr_match(), (prefix length >> 5) << 2 * will be used to compare xfrm_address_t */ if (filter->splen > (sizeof(xfrm_address_t) << 3) || filter->dplen > (sizeof(xfrm_address_t) << 3)) { kfree(filter); return -EINVAL; } } if (attrs[XFRMA_PROTO]) proto = nla_get_u8(attrs[XFRMA_PROTO]); xfrm_state_walk_init(walk, proto, filter); cb->args[0] = 1; } (void) xfrm_state_walk(net, walk, dump_one_state, &info); return skb->len; } static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, struct xfrm_state *x, u32 seq) { struct xfrm_dump_info info; struct sk_buff *skb; int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return ERR_PTR(-ENOMEM); info.in_skb = in_skb; info.out_skb = skb; info.nlmsg_seq = seq; info.nlmsg_flags = 0; err = dump_one_state(x, 0, &info); if (err) { kfree_skb(skb); return ERR_PTR(err); } return skb; } /* A wrapper for nlmsg_multicast() checking that nlsk is still available. * Must be called with RCU read lock. */ static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb, u32 pid, unsigned int group) { struct sock *nlsk = rcu_dereference(net->xfrm.nlsk); struct xfrm_translator *xtr; if (!nlsk) { kfree_skb(skb); return -EPIPE; } xtr = xfrm_get_translator(); if (xtr) { int err = xtr->alloc_compat(skb, nlmsg_hdr(skb)); xfrm_put_translator(xtr); if (err) { kfree_skb(skb); return err; } } return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC); } static inline unsigned int xfrm_spdinfo_msgsize(void) { return NLMSG_ALIGN(4) + nla_total_size(sizeof(struct xfrmu_spdinfo)) + nla_total_size(sizeof(struct xfrmu_spdhinfo)) + nla_total_size(sizeof(struct xfrmu_spdhthresh)) + nla_total_size(sizeof(struct xfrmu_spdhthresh)); } static int build_spdinfo(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, u32 flags) { struct xfrmk_spdinfo si; struct xfrmu_spdinfo spc; struct xfrmu_spdhinfo sph; struct xfrmu_spdhthresh spt4, spt6; struct nlmsghdr *nlh; int err; u32 *f; unsigned lseq; nlh = nlmsg_put(skb, portid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0); if (nlh == NULL) /* shouldn't really happen ... */ return -EMSGSIZE; f = nlmsg_data(nlh); *f = flags; xfrm_spd_getinfo(net, &si); spc.incnt = si.incnt; spc.outcnt = si.outcnt; spc.fwdcnt = si.fwdcnt; spc.inscnt = si.inscnt; spc.outscnt = si.outscnt; spc.fwdscnt = si.fwdscnt; sph.spdhcnt = si.spdhcnt; sph.spdhmcnt = si.spdhmcnt; do { lseq = read_seqbegin(&net->xfrm.policy_hthresh.lock); spt4.lbits = net->xfrm.policy_hthresh.lbits4; spt4.rbits = net->xfrm.policy_hthresh.rbits4; spt6.lbits = net->xfrm.policy_hthresh.lbits6; spt6.rbits = net->xfrm.policy_hthresh.rbits6; } while (read_seqretry(&net->xfrm.policy_hthresh.lock, lseq)); err = nla_put(skb, XFRMA_SPD_INFO, sizeof(spc), &spc); if (!err) err = nla_put(skb, XFRMA_SPD_HINFO, sizeof(sph), &sph); if (!err) err = nla_put(skb, XFRMA_SPD_IPV4_HTHRESH, sizeof(spt4), &spt4); if (!err) err = nla_put(skb, XFRMA_SPD_IPV6_HTHRESH, sizeof(spt6), &spt6); if (err) { nlmsg_cancel(skb, nlh); return err; } nlmsg_end(skb, nlh); return 0; } static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrmu_spdhthresh *thresh4 = NULL; struct xfrmu_spdhthresh *thresh6 = NULL; /* selector prefixlen thresholds to hash policies */ if (attrs[XFRMA_SPD_IPV4_HTHRESH]) { struct nlattr *rta = attrs[XFRMA_SPD_IPV4_HTHRESH]; if (nla_len(rta) < sizeof(*thresh4)) { NL_SET_ERR_MSG(extack, "Invalid SPD_IPV4_HTHRESH attribute length"); return -EINVAL; } thresh4 = nla_data(rta); if (thresh4->lbits > 32 || thresh4->rbits > 32) { NL_SET_ERR_MSG(extack, "Invalid hash threshold (must be <= 32 for IPv4)"); return -EINVAL; } } if (attrs[XFRMA_SPD_IPV6_HTHRESH]) { struct nlattr *rta = attrs[XFRMA_SPD_IPV6_HTHRESH]; if (nla_len(rta) < sizeof(*thresh6)) { NL_SET_ERR_MSG(extack, "Invalid SPD_IPV6_HTHRESH attribute length"); return -EINVAL; } thresh6 = nla_data(rta); if (thresh6->lbits > 128 || thresh6->rbits > 128) { NL_SET_ERR_MSG(extack, "Invalid hash threshold (must be <= 128 for IPv6)"); return -EINVAL; } } if (thresh4 || thresh6) { write_seqlock(&net->xfrm.policy_hthresh.lock); if (thresh4) { net->xfrm.policy_hthresh.lbits4 = thresh4->lbits; net->xfrm.policy_hthresh.rbits4 = thresh4->rbits; } if (thresh6) { net->xfrm.policy_hthresh.lbits6 = thresh6->lbits; net->xfrm.policy_hthresh.rbits6 = thresh6->rbits; } write_sequnlock(&net->xfrm.policy_hthresh.lock); xfrm_policy_hash_rebuild(net); } return 0; } static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct sk_buff *r_skb; u32 *flags = nlmsg_data(nlh); u32 sportid = NETLINK_CB(skb).portid; u32 seq = nlh->nlmsg_seq; int err; r_skb = nlmsg_new(xfrm_spdinfo_msgsize(), GFP_ATOMIC); if (r_skb == NULL) return -ENOMEM; err = build_spdinfo(r_skb, net, sportid, seq, *flags); BUG_ON(err < 0); return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); } static inline unsigned int xfrm_sadinfo_msgsize(void) { return NLMSG_ALIGN(4) + nla_total_size(sizeof(struct xfrmu_sadhinfo)) + nla_total_size(4); /* XFRMA_SAD_CNT */ } static int build_sadinfo(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, u32 flags) { struct xfrmk_sadinfo si; struct xfrmu_sadhinfo sh; struct nlmsghdr *nlh; int err; u32 *f; nlh = nlmsg_put(skb, portid, seq, XFRM_MSG_NEWSADINFO, sizeof(u32), 0); if (nlh == NULL) /* shouldn't really happen ... */ return -EMSGSIZE; f = nlmsg_data(nlh); *f = flags; xfrm_sad_getinfo(net, &si); sh.sadhmcnt = si.sadhmcnt; sh.sadhcnt = si.sadhcnt; err = nla_put_u32(skb, XFRMA_SAD_CNT, si.sadcnt); if (!err) err = nla_put(skb, XFRMA_SAD_HINFO, sizeof(sh), &sh); if (err) { nlmsg_cancel(skb, nlh); return err; } nlmsg_end(skb, nlh); return 0; } static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct sk_buff *r_skb; u32 *flags = nlmsg_data(nlh); u32 sportid = NETLINK_CB(skb).portid; u32 seq = nlh->nlmsg_seq; int err; r_skb = nlmsg_new(xfrm_sadinfo_msgsize(), GFP_ATOMIC); if (r_skb == NULL) return -ENOMEM; err = build_sadinfo(r_skb, net, sportid, seq, *flags); BUG_ON(err < 0); return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); } static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_usersa_id *p = nlmsg_data(nlh); struct xfrm_state *x; struct sk_buff *resp_skb; int err = -ESRCH; x = xfrm_user_state_lookup(net, p, attrs, &err); if (x == NULL) goto out_noput; resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq); if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); } else { err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); } xfrm_state_put(x); out_noput: return err; } static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_state *x; struct xfrm_userspi_info *p; struct xfrm_translator *xtr; struct sk_buff *resp_skb; xfrm_address_t *daddr; int family; int err; u32 mark; struct xfrm_mark m; u32 if_id = 0; p = nlmsg_data(nlh); err = verify_spi_info(p->info.id.proto, p->min, p->max, extack); if (err) goto out_noput; family = p->info.family; daddr = &p->info.id.daddr; x = NULL; mark = xfrm_mark_get(attrs, &m); if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); if (p->info.seq) { x = xfrm_find_acq_byseq(net, mark, p->info.seq); if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) { xfrm_state_put(x); x = NULL; } } if (!x) x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid, if_id, p->info.id.proto, daddr, &p->info.saddr, 1, family); err = -ENOENT; if (!x) { NL_SET_ERR_MSG(extack, "Target ACQUIRE not found"); goto out_noput; } err = xfrm_alloc_spi(x, p->min, p->max, extack); if (err) goto out; resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq); if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); goto out; } xtr = xfrm_get_translator(); if (xtr) { err = xtr->alloc_compat(skb, nlmsg_hdr(skb)); xfrm_put_translator(xtr); if (err) { kfree_skb(resp_skb); goto out; } } err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); out: xfrm_state_put(x); out_noput: return err; } static int verify_policy_dir(u8 dir, struct netlink_ext_ack *extack) { switch (dir) { case XFRM_POLICY_IN: case XFRM_POLICY_OUT: case XFRM_POLICY_FWD: break; default: NL_SET_ERR_MSG(extack, "Invalid policy direction"); return -EINVAL; } return 0; } static int verify_policy_type(u8 type, struct netlink_ext_ack *extack) { switch (type) { case XFRM_POLICY_TYPE_MAIN: #ifdef CONFIG_XFRM_SUB_POLICY case XFRM_POLICY_TYPE_SUB: #endif break; default: NL_SET_ERR_MSG(extack, "Invalid policy type"); return -EINVAL; } return 0; } static int verify_newpolicy_info(struct xfrm_userpolicy_info *p, struct netlink_ext_ack *extack) { int ret; switch (p->share) { case XFRM_SHARE_ANY: case XFRM_SHARE_SESSION: case XFRM_SHARE_USER: case XFRM_SHARE_UNIQUE: break; default: NL_SET_ERR_MSG(extack, "Invalid policy share"); return -EINVAL; } switch (p->action) { case XFRM_POLICY_ALLOW: case XFRM_POLICY_BLOCK: break; default: NL_SET_ERR_MSG(extack, "Invalid policy action"); return -EINVAL; } switch (p->sel.family) { case AF_INET: if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) { NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 32 for IPv4)"); return -EINVAL; } break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 128 for IPv6)"); return -EINVAL; } break; #else NL_SET_ERR_MSG(extack, "IPv6 support disabled"); return -EAFNOSUPPORT; #endif default: NL_SET_ERR_MSG(extack, "Invalid selector family"); return -EINVAL; } ret = verify_policy_dir(p->dir, extack); if (ret) return ret; if (p->index && (xfrm_policy_id2dir(p->index) != p->dir)) { NL_SET_ERR_MSG(extack, "Policy index doesn't match direction"); return -EINVAL; } return 0; } static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct nlattr **attrs) { struct nlattr *rt = attrs[XFRMA_SEC_CTX]; struct xfrm_user_sec_ctx *uctx; if (!rt) return 0; uctx = nla_data(rt); return security_xfrm_policy_alloc(&pol->security, uctx, GFP_KERNEL); } static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, int nr) { int i; xp->xfrm_nr = nr; for (i = 0; i < nr; i++, ut++) { struct xfrm_tmpl *t = &xp->xfrm_vec[i]; memcpy(&t->id, &ut->id, sizeof(struct xfrm_id)); memcpy(&t->saddr, &ut->saddr, sizeof(xfrm_address_t)); t->reqid = ut->reqid; t->mode = ut->mode; t->share = ut->share; t->optional = ut->optional; t->aalgos = ut->aalgos; t->ealgos = ut->ealgos; t->calgos = ut->calgos; /* If all masks are ~0, then we allow all algorithms. */ t->allalgs = !~(t->aalgos & t->ealgos & t->calgos); t->encap_family = ut->family; } } static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family, int dir, struct netlink_ext_ack *extack) { u16 prev_family; int i; if (nr > XFRM_MAX_DEPTH) { NL_SET_ERR_MSG(extack, "Template count must be <= XFRM_MAX_DEPTH (" __stringify(XFRM_MAX_DEPTH) ")"); return -EINVAL; } prev_family = family; for (i = 0; i < nr; i++) { /* We never validated the ut->family value, so many * applications simply leave it at zero. The check was * never made and ut->family was ignored because all * templates could be assumed to have the same family as * the policy itself. Now that we will have ipv4-in-ipv6 * and ipv6-in-ipv4 tunnels, this is no longer true. */ if (!ut[i].family) ut[i].family = family; switch (ut[i].mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: if (ut[i].optional && dir == XFRM_POLICY_OUT) { NL_SET_ERR_MSG(extack, "Mode in optional template not allowed in outbound policy"); return -EINVAL; } break; default: if (ut[i].family != prev_family) { NL_SET_ERR_MSG(extack, "Mode in template doesn't support a family change"); return -EINVAL; } break; } if (ut[i].mode >= XFRM_MODE_MAX) { NL_SET_ERR_MSG(extack, "Mode in template must be < XFRM_MODE_MAX (" __stringify(XFRM_MODE_MAX) ")"); return -EINVAL; } prev_family = ut[i].family; switch (ut[i].family) { case AF_INET: break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: break; #endif default: NL_SET_ERR_MSG(extack, "Invalid family in template"); return -EINVAL; } if (!xfrm_id_proto_valid(ut[i].id.proto)) { NL_SET_ERR_MSG(extack, "Invalid XFRM protocol in template"); return -EINVAL; } } return 0; } static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs, int dir, struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_TMPL]; if (!rt) { pol->xfrm_nr = 0; } else { struct xfrm_user_tmpl *utmpl = nla_data(rt); int nr = nla_len(rt) / sizeof(*utmpl); int err; err = validate_tmpl(nr, utmpl, pol->family, dir, extack); if (err) return err; copy_templates(pol, utmpl, nr); } return 0; } static int copy_from_user_policy_type(u8 *tp, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_POLICY_TYPE]; struct xfrm_userpolicy_type *upt; u8 type = XFRM_POLICY_TYPE_MAIN; int err; if (rt) { upt = nla_data(rt); type = upt->type; } err = verify_policy_type(type, extack); if (err) return err; *tp = type; return 0; } static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p) { xp->priority = p->priority; xp->index = p->index; memcpy(&xp->selector, &p->sel, sizeof(xp->selector)); memcpy(&xp->lft, &p->lft, sizeof(xp->lft)); xp->action = p->action; xp->flags = p->flags; xp->family = p->sel.family; /* XXX xp->share = p->share; */ } static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p, int dir) { memset(p, 0, sizeof(*p)); memcpy(&p->sel, &xp->selector, sizeof(p->sel)); memcpy(&p->lft, &xp->lft, sizeof(p->lft)); memcpy(&p->curlft, &xp->curlft, sizeof(p->curlft)); p->priority = xp->priority; p->index = xp->index; p->sel.family = xp->family; p->dir = dir; p->action = xp->action; p->flags = xp->flags; p->share = XFRM_SHARE_ANY; /* XXX xp->share */ } static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_userpolicy_info *p, struct nlattr **attrs, int *errp, struct netlink_ext_ack *extack) { struct xfrm_policy *xp = xfrm_policy_alloc(net, GFP_KERNEL); int err; if (!xp) { *errp = -ENOMEM; return NULL; } copy_from_user_policy(xp, p); err = copy_from_user_policy_type(&xp->type, attrs, extack); if (err) goto error; if (!(err = copy_from_user_tmpl(xp, attrs, p->dir, extack))) err = copy_from_user_sec_ctx(xp, attrs); if (err) goto error; xfrm_mark_get(attrs, &xp->mark); if (attrs[XFRMA_IF_ID]) xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); /* configure the hardware if offload is requested */ if (attrs[XFRMA_OFFLOAD_DEV]) { err = xfrm_dev_policy_add(net, xp, nla_data(attrs[XFRMA_OFFLOAD_DEV]), p->dir, extack); if (err) goto error; } return xp; error: *errp = err; xp->walk.dead = 1; xfrm_policy_destroy(xp); return NULL; } static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_userpolicy_info *p = nlmsg_data(nlh); struct xfrm_policy *xp; struct km_event c; int err; int excl; err = verify_newpolicy_info(p, extack); if (err) return err; err = verify_sec_ctx_len(attrs, extack); if (err) return err; xp = xfrm_policy_construct(net, p, attrs, &err, extack); if (!xp) return err; /* shouldn't excl be based on nlh flags?? * Aha! this is anti-netlink really i.e more pfkey derived * in netlink excl is a flag and you wouldn't need * a type XFRM_MSG_UPDPOLICY - JHS */ excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY; err = xfrm_policy_insert(p->dir, xp, excl); xfrm_audit_policy_add(xp, err ? 0 : 1, true); if (err) { xfrm_dev_policy_delete(xp); xfrm_dev_policy_free(xp); security_xfrm_policy_free(xp->security); kfree(xp); return err; } c.event = nlh->nlmsg_type; c.seq = nlh->nlmsg_seq; c.portid = nlh->nlmsg_pid; km_policy_notify(xp, p->dir, &c); xfrm_pol_put(xp); return 0; } static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb) { struct xfrm_user_tmpl vec[XFRM_MAX_DEPTH]; int i; if (xp->xfrm_nr == 0) return 0; for (i = 0; i < xp->xfrm_nr; i++) { struct xfrm_user_tmpl *up = &vec[i]; struct xfrm_tmpl *kp = &xp->xfrm_vec[i]; memset(up, 0, sizeof(*up)); memcpy(&up->id, &kp->id, sizeof(up->id)); up->family = kp->encap_family; memcpy(&up->saddr, &kp->saddr, sizeof(up->saddr)); up->reqid = kp->reqid; up->mode = kp->mode; up->share = kp->share; up->optional = kp->optional; up->aalgos = kp->aalgos; up->ealgos = kp->ealgos; up->calgos = kp->calgos; } return nla_put(skb, XFRMA_TMPL, sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr, vec); } static inline int copy_to_user_state_sec_ctx(struct xfrm_state *x, struct sk_buff *skb) { if (x->security) { return copy_sec_ctx(x->security, skb); } return 0; } static inline int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb) { if (xp->security) return copy_sec_ctx(xp->security, skb); return 0; } static inline unsigned int userpolicy_type_attrsize(void) { #ifdef CONFIG_XFRM_SUB_POLICY return nla_total_size(sizeof(struct xfrm_userpolicy_type)); #else return 0; #endif } #ifdef CONFIG_XFRM_SUB_POLICY static int copy_to_user_policy_type(u8 type, struct sk_buff *skb) { struct xfrm_userpolicy_type upt; /* Sadly there are two holes in struct xfrm_userpolicy_type */ memset(&upt, 0, sizeof(upt)); upt.type = type; return nla_put(skb, XFRMA_POLICY_TYPE, sizeof(upt), &upt); } #else static inline int copy_to_user_policy_type(u8 type, struct sk_buff *skb) { return 0; } #endif static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr) { struct xfrm_dump_info *sp = ptr; struct xfrm_userpolicy_info *p; struct sk_buff *in_skb = sp->in_skb; struct sk_buff *skb = sp->out_skb; struct xfrm_translator *xtr; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, NETLINK_CB(in_skb).portid, sp->nlmsg_seq, XFRM_MSG_NEWPOLICY, sizeof(*p), sp->nlmsg_flags); if (nlh == NULL) return -EMSGSIZE; p = nlmsg_data(nlh); copy_to_user_policy(xp, p, dir); err = copy_to_user_tmpl(xp, skb); if (!err) err = copy_to_user_sec_ctx(xp, skb); if (!err) err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); if (!err && xp->xdo.dev) err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; } nlmsg_end(skb, nlh); xtr = xfrm_get_translator(); if (xtr) { err = xtr->alloc_compat(skb, nlh); xfrm_put_translator(xtr); if (err) { nlmsg_cancel(skb, nlh); return err; } } return 0; } static int xfrm_dump_policy_done(struct netlink_callback *cb) { struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args; struct net *net = sock_net(cb->skb->sk); xfrm_policy_walk_done(walk, net); return 0; } static int xfrm_dump_policy_start(struct netlink_callback *cb) { struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args; BUILD_BUG_ON(sizeof(*walk) > sizeof(cb->args)); xfrm_policy_walk_init(walk, XFRM_POLICY_TYPE_ANY); return 0; } static int xfrm_dump_policy(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args; struct xfrm_dump_info info; info.in_skb = cb->skb; info.out_skb = skb; info.nlmsg_seq = cb->nlh->nlmsg_seq; info.nlmsg_flags = NLM_F_MULTI; (void) xfrm_policy_walk(net, walk, dump_one_policy, &info); return skb->len; } static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, struct xfrm_policy *xp, int dir, u32 seq) { struct xfrm_dump_info info; struct sk_buff *skb; int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); info.in_skb = in_skb; info.out_skb = skb; info.nlmsg_seq = seq; info.nlmsg_flags = 0; err = dump_one_policy(xp, dir, 0, &info); if (err) { kfree_skb(skb); return ERR_PTR(err); } return skb; } static int xfrm_notify_userpolicy(struct net *net) { struct xfrm_userpolicy_default *up; int len = NLMSG_ALIGN(sizeof(*up)); struct nlmsghdr *nlh; struct sk_buff *skb; int err; skb = nlmsg_new(len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_GETDEFAULT, sizeof(*up), 0); if (nlh == NULL) { kfree_skb(skb); return -EMSGSIZE; } up = nlmsg_data(nlh); up->in = net->xfrm.policy_default[XFRM_POLICY_IN]; up->fwd = net->xfrm.policy_default[XFRM_POLICY_FWD]; up->out = net->xfrm.policy_default[XFRM_POLICY_OUT]; nlmsg_end(skb, nlh); rcu_read_lock(); err = xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY); rcu_read_unlock(); return err; } static bool xfrm_userpolicy_is_valid(__u8 policy) { return policy == XFRM_USERPOLICY_BLOCK || policy == XFRM_USERPOLICY_ACCEPT; } static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_userpolicy_default *up = nlmsg_data(nlh); if (xfrm_userpolicy_is_valid(up->in)) net->xfrm.policy_default[XFRM_POLICY_IN] = up->in; if (xfrm_userpolicy_is_valid(up->fwd)) net->xfrm.policy_default[XFRM_POLICY_FWD] = up->fwd; if (xfrm_userpolicy_is_valid(up->out)) net->xfrm.policy_default[XFRM_POLICY_OUT] = up->out; rt_genid_bump_all(net); xfrm_notify_userpolicy(net); return 0; } static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct sk_buff *r_skb; struct nlmsghdr *r_nlh; struct net *net = sock_net(skb->sk); struct xfrm_userpolicy_default *r_up; int len = NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_default)); u32 portid = NETLINK_CB(skb).portid; u32 seq = nlh->nlmsg_seq; r_skb = nlmsg_new(len, GFP_ATOMIC); if (!r_skb) return -ENOMEM; r_nlh = nlmsg_put(r_skb, portid, seq, XFRM_MSG_GETDEFAULT, sizeof(*r_up), 0); if (!r_nlh) { kfree_skb(r_skb); return -EMSGSIZE; } r_up = nlmsg_data(r_nlh); r_up->in = net->xfrm.policy_default[XFRM_POLICY_IN]; r_up->fwd = net->xfrm.policy_default[XFRM_POLICY_FWD]; r_up->out = net->xfrm.policy_default[XFRM_POLICY_OUT]; nlmsg_end(r_skb, r_nlh); return nlmsg_unicast(net->xfrm.nlsk, r_skb, portid); } static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_policy *xp; struct xfrm_userpolicy_id *p; u8 type = XFRM_POLICY_TYPE_MAIN; int err; struct km_event c; int delete; struct xfrm_mark m; u32 if_id = 0; p = nlmsg_data(nlh); delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY; err = copy_from_user_policy_type(&type, attrs, extack); if (err) return err; err = verify_policy_dir(p->dir, extack); if (err) return err; if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); xfrm_mark_get(attrs, &m); if (p->index) xp = xfrm_policy_byid(net, &m, if_id, type, p->dir, p->index, delete, &err); else { struct nlattr *rt = attrs[XFRMA_SEC_CTX]; struct xfrm_sec_ctx *ctx; err = verify_sec_ctx_len(attrs, extack); if (err) return err; ctx = NULL; if (rt) { struct xfrm_user_sec_ctx *uctx = nla_data(rt); err = security_xfrm_policy_alloc(&ctx, uctx, GFP_KERNEL); if (err) return err; } xp = xfrm_policy_bysel_ctx(net, &m, if_id, type, p->dir, &p->sel, ctx, delete, &err); security_xfrm_policy_free(ctx); } if (xp == NULL) return -ENOENT; if (!delete) { struct sk_buff *resp_skb; resp_skb = xfrm_policy_netlink(skb, xp, p->dir, nlh->nlmsg_seq); if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); } else { err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); } } else { xfrm_dev_policy_delete(xp); xfrm_audit_policy_delete(xp, err ? 0 : 1, true); if (err != 0) goto out; c.data.byid = p->index; c.event = nlh->nlmsg_type; c.seq = nlh->nlmsg_seq; c.portid = nlh->nlmsg_pid; km_policy_notify(xp, p->dir, &c); } out: xfrm_pol_put(xp); return err; } static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct km_event c; struct xfrm_usersa_flush *p = nlmsg_data(nlh); int err; err = xfrm_state_flush(net, p->proto, true, false); if (err) { if (err == -ESRCH) /* empty table */ return 0; return err; } c.data.proto = p->proto; c.event = nlh->nlmsg_type; c.seq = nlh->nlmsg_seq; c.portid = nlh->nlmsg_pid; c.net = net; km_state_notify(NULL, &c); return 0; } static inline unsigned int xfrm_aevent_msgsize(struct xfrm_state *x) { unsigned int replay_size = x->replay_esn ? xfrm_replay_state_esn_len(x->replay_esn) : sizeof(struct xfrm_replay_state); return NLMSG_ALIGN(sizeof(struct xfrm_aevent_id)) + nla_total_size(replay_size) + nla_total_size_64bit(sizeof(struct xfrm_lifetime_cur)) + nla_total_size(sizeof(struct xfrm_mark)) + nla_total_size(4) /* XFRM_AE_RTHR */ + nla_total_size(4); /* XFRM_AE_ETHR */ } static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) { struct xfrm_aevent_id *id; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, c->portid, c->seq, XFRM_MSG_NEWAE, sizeof(*id), 0); if (nlh == NULL) return -EMSGSIZE; id = nlmsg_data(nlh); memset(&id->sa_id, 0, sizeof(id->sa_id)); memcpy(&id->sa_id.daddr, &x->id.daddr, sizeof(x->id.daddr)); id->sa_id.spi = x->id.spi; id->sa_id.family = x->props.family; id->sa_id.proto = x->id.proto; memcpy(&id->saddr, &x->props.saddr, sizeof(x->props.saddr)); id->reqid = x->props.reqid; id->flags = c->data.aevent; if (x->replay_esn) { err = nla_put(skb, XFRMA_REPLAY_ESN_VAL, xfrm_replay_state_esn_len(x->replay_esn), x->replay_esn); } else { err = nla_put(skb, XFRMA_REPLAY_VAL, sizeof(x->replay), &x->replay); } if (err) goto out_cancel; err = nla_put_64bit(skb, XFRMA_LTIME_VAL, sizeof(x->curlft), &x->curlft, XFRMA_PAD); if (err) goto out_cancel; if (id->flags & XFRM_AE_RTHR) { err = nla_put_u32(skb, XFRMA_REPLAY_THRESH, x->replay_maxdiff); if (err) goto out_cancel; } if (id->flags & XFRM_AE_ETHR) { err = nla_put_u32(skb, XFRMA_ETIMER_THRESH, x->replay_maxage * 10 / HZ); if (err) goto out_cancel; } err = xfrm_mark_put(skb, &x->mark); if (err) goto out_cancel; err = xfrm_if_id_put(skb, x->if_id); if (err) goto out_cancel; nlmsg_end(skb, nlh); return 0; out_cancel: nlmsg_cancel(skb, nlh); return err; } static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_state *x; struct sk_buff *r_skb; int err; struct km_event c; u32 mark; struct xfrm_mark m; struct xfrm_aevent_id *p = nlmsg_data(nlh); struct xfrm_usersa_id *id = &p->sa_id; mark = xfrm_mark_get(attrs, &m); x = xfrm_state_lookup(net, mark, &id->daddr, id->spi, id->proto, id->family); if (x == NULL) return -ESRCH; r_skb = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC); if (r_skb == NULL) { xfrm_state_put(x); return -ENOMEM; } /* * XXX: is this lock really needed - none of the other * gets lock (the concern is things getting updated * while we are still reading) - jhs */ spin_lock_bh(&x->lock); c.data.aevent = p->flags; c.seq = nlh->nlmsg_seq; c.portid = nlh->nlmsg_pid; err = build_aevent(r_skb, x, &c); BUG_ON(err < 0); err = nlmsg_unicast(net->xfrm.nlsk, r_skb, NETLINK_CB(skb).portid); spin_unlock_bh(&x->lock); xfrm_state_put(x); return err; } static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_state *x; struct km_event c; int err = -EINVAL; u32 mark = 0; struct xfrm_mark m; struct xfrm_aevent_id *p = nlmsg_data(nlh); struct nlattr *rp = attrs[XFRMA_REPLAY_VAL]; struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL]; struct nlattr *lt = attrs[XFRMA_LTIME_VAL]; struct nlattr *et = attrs[XFRMA_ETIMER_THRESH]; struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH]; if (!lt && !rp && !re && !et && !rt) { NL_SET_ERR_MSG(extack, "Missing required attribute for AE"); return err; } /* pedantic mode - thou shalt sayeth replaceth */ if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) { NL_SET_ERR_MSG(extack, "NLM_F_REPLACE flag is required"); return err; } mark = xfrm_mark_get(attrs, &m); x = xfrm_state_lookup(net, mark, &p->sa_id.daddr, p->sa_id.spi, p->sa_id.proto, p->sa_id.family); if (x == NULL) return -ESRCH; if (x->km.state != XFRM_STATE_VALID) { NL_SET_ERR_MSG(extack, "SA must be in VALID state"); goto out; } err = xfrm_replay_verify_len(x->replay_esn, re, extack); if (err) goto out; spin_lock_bh(&x->lock); xfrm_update_ae_params(x, attrs, 1); spin_unlock_bh(&x->lock); c.event = nlh->nlmsg_type; c.seq = nlh->nlmsg_seq; c.portid = nlh->nlmsg_pid; c.data.aevent = XFRM_AE_CU; km_state_notify(x, &c); err = 0; out: xfrm_state_put(x); return err; } static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct km_event c; u8 type = XFRM_POLICY_TYPE_MAIN; int err; err = copy_from_user_policy_type(&type, attrs, extack); if (err) return err; err = xfrm_policy_flush(net, type, true); if (err) { if (err == -ESRCH) /* empty table */ return 0; return err; } c.data.type = type; c.event = nlh->nlmsg_type; c.seq = nlh->nlmsg_seq; c.portid = nlh->nlmsg_pid; c.net = net; km_policy_notify(NULL, 0, &c); return 0; } static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_policy *xp; struct xfrm_user_polexpire *up = nlmsg_data(nlh); struct xfrm_userpolicy_info *p = &up->pol; u8 type = XFRM_POLICY_TYPE_MAIN; int err = -ENOENT; struct xfrm_mark m; u32 if_id = 0; err = copy_from_user_policy_type(&type, attrs, extack); if (err) return err; err = verify_policy_dir(p->dir, extack); if (err) return err; if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); xfrm_mark_get(attrs, &m); if (p->index) xp = xfrm_policy_byid(net, &m, if_id, type, p->dir, p->index, 0, &err); else { struct nlattr *rt = attrs[XFRMA_SEC_CTX]; struct xfrm_sec_ctx *ctx; err = verify_sec_ctx_len(attrs, extack); if (err) return err; ctx = NULL; if (rt) { struct xfrm_user_sec_ctx *uctx = nla_data(rt); err = security_xfrm_policy_alloc(&ctx, uctx, GFP_KERNEL); if (err) return err; } xp = xfrm_policy_bysel_ctx(net, &m, if_id, type, p->dir, &p->sel, ctx, 0, &err); security_xfrm_policy_free(ctx); } if (xp == NULL) return -ENOENT; if (unlikely(xp->walk.dead)) goto out; err = 0; if (up->hard) { xfrm_policy_delete(xp, p->dir); xfrm_audit_policy_delete(xp, 1, true); } km_policy_expired(xp, p->dir, up->hard, nlh->nlmsg_pid); out: xfrm_pol_put(xp); return err; } static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_state *x; int err; struct xfrm_user_expire *ue = nlmsg_data(nlh); struct xfrm_usersa_info *p = &ue->state; struct xfrm_mark m; u32 mark = xfrm_mark_get(attrs, &m); x = xfrm_state_lookup(net, mark, &p->id.daddr, p->id.spi, p->id.proto, p->family); err = -ENOENT; if (x == NULL) return err; spin_lock_bh(&x->lock); err = -EINVAL; if (x->km.state != XFRM_STATE_VALID) { NL_SET_ERR_MSG(extack, "SA must be in VALID state"); goto out; } km_state_expired(x, ue->hard, nlh->nlmsg_pid); if (ue->hard) { __xfrm_state_delete(x); xfrm_audit_state_delete(x, 1, true); } err = 0; out: spin_unlock_bh(&x->lock); xfrm_state_put(x); return err; } static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_policy *xp; struct xfrm_user_tmpl *ut; int i; struct nlattr *rt = attrs[XFRMA_TMPL]; struct xfrm_mark mark; struct xfrm_user_acquire *ua = nlmsg_data(nlh); struct xfrm_state *x = xfrm_state_alloc(net); int err = -ENOMEM; if (!x) goto nomem; xfrm_mark_get(attrs, &mark); err = verify_newpolicy_info(&ua->policy, extack); if (err) goto free_state; err = verify_sec_ctx_len(attrs, extack); if (err) goto free_state; /* build an XP */ xp = xfrm_policy_construct(net, &ua->policy, attrs, &err, extack); if (!xp) goto free_state; memcpy(&x->id, &ua->id, sizeof(ua->id)); memcpy(&x->props.saddr, &ua->saddr, sizeof(ua->saddr)); memcpy(&x->sel, &ua->sel, sizeof(ua->sel)); xp->mark.m = x->mark.m = mark.m; xp->mark.v = x->mark.v = mark.v; ut = nla_data(rt); /* extract the templates and for each call km_key */ for (i = 0; i < xp->xfrm_nr; i++, ut++) { struct xfrm_tmpl *t = &xp->xfrm_vec[i]; memcpy(&x->id, &t->id, sizeof(x->id)); x->props.mode = t->mode; x->props.reqid = t->reqid; x->props.family = ut->family; t->aalgos = ua->aalgos; t->ealgos = ua->ealgos; t->calgos = ua->calgos; err = km_query(x, t, xp); } xfrm_state_free(x); kfree(xp); return 0; free_state: xfrm_state_free(x); nomem: return err; } #ifdef CONFIG_XFRM_MIGRATE static int copy_from_user_migrate(struct xfrm_migrate *ma, struct xfrm_kmaddress *k, struct nlattr **attrs, int *num, struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_MIGRATE]; struct xfrm_user_migrate *um; int i, num_migrate; if (k != NULL) { struct xfrm_user_kmaddress *uk; uk = nla_data(attrs[XFRMA_KMADDRESS]); memcpy(&k->local, &uk->local, sizeof(k->local)); memcpy(&k->remote, &uk->remote, sizeof(k->remote)); k->family = uk->family; k->reserved = uk->reserved; } um = nla_data(rt); num_migrate = nla_len(rt) / sizeof(*um); if (num_migrate <= 0 || num_migrate > XFRM_MAX_DEPTH) { NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)"); return -EINVAL; } for (i = 0; i < num_migrate; i++, um++, ma++) { memcpy(&ma->old_daddr, &um->old_daddr, sizeof(ma->old_daddr)); memcpy(&ma->old_saddr, &um->old_saddr, sizeof(ma->old_saddr)); memcpy(&ma->new_daddr, &um->new_daddr, sizeof(ma->new_daddr)); memcpy(&ma->new_saddr, &um->new_saddr, sizeof(ma->new_saddr)); ma->proto = um->proto; ma->mode = um->mode; ma->reqid = um->reqid; ma->old_family = um->old_family; ma->new_family = um->new_family; } *num = i; return 0; } static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { struct xfrm_userpolicy_id *pi = nlmsg_data(nlh); struct xfrm_migrate m[XFRM_MAX_DEPTH]; struct xfrm_kmaddress km, *kmp; u8 type; int err; int n = 0; struct net *net = sock_net(skb->sk); struct xfrm_encap_tmpl *encap = NULL; u32 if_id = 0; if (!attrs[XFRMA_MIGRATE]) { NL_SET_ERR_MSG(extack, "Missing required MIGRATE attribute"); return -EINVAL; } kmp = attrs[XFRMA_KMADDRESS] ? &km : NULL; err = copy_from_user_policy_type(&type, attrs, extack); if (err) return err; err = copy_from_user_migrate(m, kmp, attrs, &n, extack); if (err) return err; if (!n) return 0; if (attrs[XFRMA_ENCAP]) { encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]), sizeof(*encap), GFP_KERNEL); if (!encap) return -ENOMEM; } if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, if_id, extack); kfree(encap); return err; } #else static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs, struct netlink_ext_ack *extack) { return -ENOPROTOOPT; } #endif #ifdef CONFIG_XFRM_MIGRATE static int copy_to_user_migrate(const struct xfrm_migrate *m, struct sk_buff *skb) { struct xfrm_user_migrate um; memset(&um, 0, sizeof(um)); um.proto = m->proto; um.mode = m->mode; um.reqid = m->reqid; um.old_family = m->old_family; memcpy(&um.old_daddr, &m->old_daddr, sizeof(um.old_daddr)); memcpy(&um.old_saddr, &m->old_saddr, sizeof(um.old_saddr)); um.new_family = m->new_family; memcpy(&um.new_daddr, &m->new_daddr, sizeof(um.new_daddr)); memcpy(&um.new_saddr, &m->new_saddr, sizeof(um.new_saddr)); return nla_put(skb, XFRMA_MIGRATE, sizeof(um), &um); } static int copy_to_user_kmaddress(const struct xfrm_kmaddress *k, struct sk_buff *skb) { struct xfrm_user_kmaddress uk; memset(&uk, 0, sizeof(uk)); uk.family = k->family; uk.reserved = k->reserved; memcpy(&uk.local, &k->local, sizeof(uk.local)); memcpy(&uk.remote, &k->remote, sizeof(uk.remote)); return nla_put(skb, XFRMA_KMADDRESS, sizeof(uk), &uk); } static inline unsigned int xfrm_migrate_msgsize(int num_migrate, int with_kma, int with_encp) { return NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_id)) + (with_kma ? nla_total_size(sizeof(struct xfrm_kmaddress)) : 0) + (with_encp ? nla_total_size(sizeof(struct xfrm_encap_tmpl)) : 0) + nla_total_size(sizeof(struct xfrm_user_migrate) * num_migrate) + userpolicy_type_attrsize(); } static int build_migrate(struct sk_buff *skb, const struct xfrm_migrate *m, int num_migrate, const struct xfrm_kmaddress *k, const struct xfrm_selector *sel, const struct xfrm_encap_tmpl *encap, u8 dir, u8 type) { const struct xfrm_migrate *mp; struct xfrm_userpolicy_id *pol_id; struct nlmsghdr *nlh; int i, err; nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_MIGRATE, sizeof(*pol_id), 0); if (nlh == NULL) return -EMSGSIZE; pol_id = nlmsg_data(nlh); /* copy data from selector, dir, and type to the pol_id */ memset(pol_id, 0, sizeof(*pol_id)); memcpy(&pol_id->sel, sel, sizeof(pol_id->sel)); pol_id->dir = dir; if (k != NULL) { err = copy_to_user_kmaddress(k, skb); if (err) goto out_cancel; } if (encap) { err = nla_put(skb, XFRMA_ENCAP, sizeof(*encap), encap); if (err) goto out_cancel; } err = copy_to_user_policy_type(type, skb); if (err) goto out_cancel; for (i = 0, mp = m ; i < num_migrate; i++, mp++) { err = copy_to_user_migrate(mp, skb); if (err) goto out_cancel; } nlmsg_end(skb, nlh); return 0; out_cancel: nlmsg_cancel(skb, nlh); return err; } static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { struct net *net = &init_net; struct sk_buff *skb; int err; skb = nlmsg_new(xfrm_migrate_msgsize(num_migrate, !!k, !!encap), GFP_ATOMIC); if (skb == NULL) return -ENOMEM; /* build migrate */ err = build_migrate(skb, m, num_migrate, k, sel, encap, dir, type); BUG_ON(err < 0); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_MIGRATE); } #else static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { return -ENOPROTOOPT; } #endif #define XMSGSIZE(type) sizeof(struct type) const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info), [XFRM_MSG_DELSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id), [XFRM_MSG_GETSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id), [XFRM_MSG_NEWPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info), [XFRM_MSG_DELPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), [XFRM_MSG_GETPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), [XFRM_MSG_ALLOCSPI - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userspi_info), [XFRM_MSG_ACQUIRE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_acquire), [XFRM_MSG_EXPIRE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_expire), [XFRM_MSG_UPDPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info), [XFRM_MSG_UPDSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info), [XFRM_MSG_POLEXPIRE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_polexpire), [XFRM_MSG_FLUSHSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_flush), [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = 0, [XFRM_MSG_NEWAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id), [XFRM_MSG_GETAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id), [XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report), [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_SETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), [XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), }; EXPORT_SYMBOL_GPL(xfrm_msg_min); #undef XMSGSIZE const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_SA] = { .len = sizeof(struct xfrm_usersa_info)}, [XFRMA_POLICY] = { .len = sizeof(struct xfrm_userpolicy_info)}, [XFRMA_LASTUSED] = { .type = NLA_U64}, [XFRMA_ALG_AUTH_TRUNC] = { .len = sizeof(struct xfrm_algo_auth)}, [XFRMA_ALG_AEAD] = { .len = sizeof(struct xfrm_algo_aead) }, [XFRMA_ALG_AUTH] = { .len = sizeof(struct xfrm_algo) }, [XFRMA_ALG_CRYPT] = { .len = sizeof(struct xfrm_algo) }, [XFRMA_ALG_COMP] = { .len = sizeof(struct xfrm_algo) }, [XFRMA_ENCAP] = { .len = sizeof(struct xfrm_encap_tmpl) }, [XFRMA_TMPL] = { .len = sizeof(struct xfrm_user_tmpl) }, [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_user_sec_ctx) }, [XFRMA_LTIME_VAL] = { .len = sizeof(struct xfrm_lifetime_cur) }, [XFRMA_REPLAY_VAL] = { .len = sizeof(struct xfrm_replay_state) }, [XFRMA_REPLAY_THRESH] = { .type = NLA_U32 }, [XFRMA_ETIMER_THRESH] = { .type = NLA_U32 }, [XFRMA_SRCADDR] = { .len = sizeof(xfrm_address_t) }, [XFRMA_COADDR] = { .len = sizeof(xfrm_address_t) }, [XFRMA_POLICY_TYPE] = { .len = sizeof(struct xfrm_userpolicy_type)}, [XFRMA_MIGRATE] = { .len = sizeof(struct xfrm_user_migrate) }, [XFRMA_KMADDRESS] = { .len = sizeof(struct xfrm_user_kmaddress) }, [XFRMA_MARK] = { .len = sizeof(struct xfrm_mark) }, [XFRMA_TFCPAD] = { .type = NLA_U32 }, [XFRMA_REPLAY_ESN_VAL] = { .len = sizeof(struct xfrm_replay_state_esn) }, [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 }, [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, [XFRMA_SET_MARK] = { .type = NLA_U32 }, [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, [XFRMA_IF_ID] = { .type = NLA_U32 }, [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(xfrma_policy); static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { [XFRMA_SPD_IPV4_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) }, [XFRMA_SPD_IPV6_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) }, }; static const struct xfrm_link { int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **, struct netlink_ext_ack *); int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff *, struct netlink_callback *); int (*done)(struct netlink_callback *); const struct nla_policy *nla_pol; int nla_max; } xfrm_dispatch[XFRM_NR_MSGTYPES] = { [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = { .doit = xfrm_add_sa }, [XFRM_MSG_DELSA - XFRM_MSG_BASE] = { .doit = xfrm_del_sa }, [XFRM_MSG_GETSA - XFRM_MSG_BASE] = { .doit = xfrm_get_sa, .dump = xfrm_dump_sa, .done = xfrm_dump_sa_done }, [XFRM_MSG_NEWPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_add_policy }, [XFRM_MSG_DELPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_get_policy }, [XFRM_MSG_GETPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_get_policy, .start = xfrm_dump_policy_start, .dump = xfrm_dump_policy, .done = xfrm_dump_policy_done }, [XFRM_MSG_ALLOCSPI - XFRM_MSG_BASE] = { .doit = xfrm_alloc_userspi }, [XFRM_MSG_ACQUIRE - XFRM_MSG_BASE] = { .doit = xfrm_add_acquire }, [XFRM_MSG_EXPIRE - XFRM_MSG_BASE] = { .doit = xfrm_add_sa_expire }, [XFRM_MSG_UPDPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_add_policy }, [XFRM_MSG_UPDSA - XFRM_MSG_BASE] = { .doit = xfrm_add_sa }, [XFRM_MSG_POLEXPIRE - XFRM_MSG_BASE] = { .doit = xfrm_add_pol_expire}, [XFRM_MSG_FLUSHSA - XFRM_MSG_BASE] = { .doit = xfrm_flush_sa }, [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy }, [XFRM_MSG_NEWAE - XFRM_MSG_BASE] = { .doit = xfrm_new_ae }, [XFRM_MSG_GETAE - XFRM_MSG_BASE] = { .doit = xfrm_get_ae }, [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate }, [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo }, [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_set_spdinfo, .nla_pol = xfrma_spd_policy, .nla_max = XFRMA_SPD_MAX }, [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, [XFRM_MSG_SETDEFAULT - XFRM_MSG_BASE] = { .doit = xfrm_set_default }, [XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = { .doit = xfrm_get_default }, }; static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *attrs[XFRMA_MAX+1]; const struct xfrm_link *link; struct nlmsghdr *nlh64 = NULL; int type, err; type = nlh->nlmsg_type; if (type > XFRM_MSG_MAX) return -EINVAL; type -= XFRM_MSG_BASE; link = &xfrm_dispatch[type]; /* All operations require privileges, even GET */ if (!netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (in_compat_syscall()) { struct xfrm_translator *xtr = xfrm_get_translator(); if (!xtr) return -EOPNOTSUPP; nlh64 = xtr->rcv_msg_compat(nlh, link->nla_max, link->nla_pol, extack); xfrm_put_translator(xtr); if (IS_ERR(nlh64)) return PTR_ERR(nlh64); if (nlh64) nlh = nlh64; } if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) && (nlh->nlmsg_flags & NLM_F_DUMP)) { struct netlink_dump_control c = { .start = link->start, .dump = link->dump, .done = link->done, }; if (link->dump == NULL) { err = -EINVAL; goto err; } err = netlink_dump_start(net->xfrm.nlsk, skb, nlh, &c); goto err; } err = nlmsg_parse_deprecated(nlh, xfrm_msg_min[type], attrs, link->nla_max ? : XFRMA_MAX, link->nla_pol ? : xfrma_policy, extack); if (err < 0) goto err; if (link->doit == NULL) { err = -EINVAL; goto err; } err = link->doit(skb, nlh, attrs, extack); /* We need to free skb allocated in xfrm_alloc_compat() before * returning from this function, because consume_skb() won't take * care of frag_list since netlink destructor sets * sbk->head to NULL. (see netlink_skb_destructor()) */ if (skb_has_frag_list(skb)) { kfree_skb(skb_shinfo(skb)->frag_list); skb_shinfo(skb)->frag_list = NULL; } err: kvfree(nlh64); return err; } static void xfrm_netlink_rcv(struct sk_buff *skb) { struct net *net = sock_net(skb->sk); mutex_lock(&net->xfrm.xfrm_cfg_mutex); netlink_rcv_skb(skb, &xfrm_user_rcv_msg); mutex_unlock(&net->xfrm.xfrm_cfg_mutex); } static inline unsigned int xfrm_expire_msgsize(void) { return NLMSG_ALIGN(sizeof(struct xfrm_user_expire)) + nla_total_size(sizeof(struct xfrm_mark)); } static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) { struct xfrm_user_expire *ue; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, c->portid, 0, XFRM_MSG_EXPIRE, sizeof(*ue), 0); if (nlh == NULL) return -EMSGSIZE; ue = nlmsg_data(nlh); copy_to_user_state(x, &ue->state); ue->hard = (c->data.hard != 0) ? 1 : 0; /* clear the padding bytes */ memset_after(ue, 0, hard); err = xfrm_mark_put(skb, &x->mark); if (err) return err; err = xfrm_if_id_put(skb, x->if_id); if (err) return err; nlmsg_end(skb, nlh); return 0; } static int xfrm_exp_state_notify(struct xfrm_state *x, const struct km_event *c) { struct net *net = xs_net(x); struct sk_buff *skb; skb = nlmsg_new(xfrm_expire_msgsize(), GFP_ATOMIC); if (skb == NULL) return -ENOMEM; if (build_expire(skb, x, c) < 0) { kfree_skb(skb); return -EMSGSIZE; } return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_EXPIRE); } static int xfrm_aevent_state_notify(struct xfrm_state *x, const struct km_event *c) { struct net *net = xs_net(x); struct sk_buff *skb; int err; skb = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC); if (skb == NULL) return -ENOMEM; err = build_aevent(skb, x, c); BUG_ON(err < 0); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_AEVENTS); } static int xfrm_notify_sa_flush(const struct km_event *c) { struct net *net = c->net; struct xfrm_usersa_flush *p; struct nlmsghdr *nlh; struct sk_buff *skb; int len = NLMSG_ALIGN(sizeof(struct xfrm_usersa_flush)); skb = nlmsg_new(len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; nlh = nlmsg_put(skb, c->portid, c->seq, XFRM_MSG_FLUSHSA, sizeof(*p), 0); if (nlh == NULL) { kfree_skb(skb); return -EMSGSIZE; } p = nlmsg_data(nlh); p->proto = c->data.proto; nlmsg_end(skb, nlh); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_SA); } static inline unsigned int xfrm_sa_len(struct xfrm_state *x) { unsigned int l = 0; if (x->aead) l += nla_total_size(aead_len(x->aead)); if (x->aalg) { l += nla_total_size(sizeof(struct xfrm_algo) + (x->aalg->alg_key_len + 7) / 8); l += nla_total_size(xfrm_alg_auth_len(x->aalg)); } if (x->ealg) l += nla_total_size(xfrm_alg_len(x->ealg)); if (x->calg) l += nla_total_size(sizeof(*x->calg)); if (x->encap) l += nla_total_size(sizeof(*x->encap)); if (x->tfcpad) l += nla_total_size(sizeof(x->tfcpad)); if (x->replay_esn) l += nla_total_size(xfrm_replay_state_esn_len(x->replay_esn)); else l += nla_total_size(sizeof(struct xfrm_replay_state)); if (x->security) l += nla_total_size(sizeof(struct xfrm_user_sec_ctx) + x->security->ctx_len); if (x->coaddr) l += nla_total_size(sizeof(*x->coaddr)); if (x->props.extra_flags) l += nla_total_size(sizeof(x->props.extra_flags)); if (x->xso.dev) l += nla_total_size(sizeof(struct xfrm_user_offload)); if (x->props.smark.v | x->props.smark.m) { l += nla_total_size(sizeof(x->props.smark.v)); l += nla_total_size(sizeof(x->props.smark.m)); } if (x->if_id) l += nla_total_size(sizeof(x->if_id)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); if (x->mapping_maxage) l += nla_total_size(sizeof(x->mapping_maxage)); return l; } static int xfrm_notify_sa(struct xfrm_state *x, const struct km_event *c) { struct net *net = xs_net(x); struct xfrm_usersa_info *p; struct xfrm_usersa_id *id; struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int len = xfrm_sa_len(x); unsigned int headlen; int err; headlen = sizeof(*p); if (c->event == XFRM_MSG_DELSA) { len += nla_total_size(headlen); headlen = sizeof(*id); len += nla_total_size(sizeof(struct xfrm_mark)); } len += NLMSG_ALIGN(headlen); skb = nlmsg_new(len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; nlh = nlmsg_put(skb, c->portid, c->seq, c->event, headlen, 0); err = -EMSGSIZE; if (nlh == NULL) goto out_free_skb; p = nlmsg_data(nlh); if (c->event == XFRM_MSG_DELSA) { struct nlattr *attr; id = nlmsg_data(nlh); memset(id, 0, sizeof(*id)); memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr)); id->spi = x->id.spi; id->family = x->props.family; id->proto = x->id.proto; attr = nla_reserve(skb, XFRMA_SA, sizeof(*p)); err = -EMSGSIZE; if (attr == NULL) goto out_free_skb; p = nla_data(attr); } err = copy_to_user_state_extra(x, p, skb); if (err) goto out_free_skb; nlmsg_end(skb, nlh); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_SA); out_free_skb: kfree_skb(skb); return err; } static int xfrm_send_state_notify(struct xfrm_state *x, const struct km_event *c) { switch (c->event) { case XFRM_MSG_EXPIRE: return xfrm_exp_state_notify(x, c); case XFRM_MSG_NEWAE: return xfrm_aevent_state_notify(x, c); case XFRM_MSG_DELSA: case XFRM_MSG_UPDSA: case XFRM_MSG_NEWSA: return xfrm_notify_sa(x, c); case XFRM_MSG_FLUSHSA: return xfrm_notify_sa_flush(c); default: printk(KERN_NOTICE "xfrm_user: Unknown SA event %d\n", c->event); break; } return 0; } static inline unsigned int xfrm_acquire_msgsize(struct xfrm_state *x, struct xfrm_policy *xp) { return NLMSG_ALIGN(sizeof(struct xfrm_user_acquire)) + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr) + nla_total_size(sizeof(struct xfrm_mark)) + nla_total_size(xfrm_user_sec_ctx_size(x->security)) + userpolicy_type_attrsize(); } static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, struct xfrm_tmpl *xt, struct xfrm_policy *xp) { __u32 seq = xfrm_get_acqseq(); struct xfrm_user_acquire *ua; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_ACQUIRE, sizeof(*ua), 0); if (nlh == NULL) return -EMSGSIZE; ua = nlmsg_data(nlh); memcpy(&ua->id, &x->id, sizeof(ua->id)); memcpy(&ua->saddr, &x->props.saddr, sizeof(ua->saddr)); memcpy(&ua->sel, &x->sel, sizeof(ua->sel)); copy_to_user_policy(xp, &ua->policy, XFRM_POLICY_OUT); ua->aalgos = xt->aalgos; ua->ealgos = xt->ealgos; ua->calgos = xt->calgos; ua->seq = x->km.seq = seq; err = copy_to_user_tmpl(xp, skb); if (!err) err = copy_to_user_state_sec_ctx(x, skb); if (!err) err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); if (!err && xp->xdo.dev) err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; } nlmsg_end(skb, nlh); return 0; } static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt, struct xfrm_policy *xp) { struct net *net = xs_net(x); struct sk_buff *skb; int err; skb = nlmsg_new(xfrm_acquire_msgsize(x, xp), GFP_ATOMIC); if (skb == NULL) return -ENOMEM; err = build_acquire(skb, x, xt, xp); BUG_ON(err < 0); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_ACQUIRE); } /* User gives us xfrm_user_policy_info followed by an array of 0 * or more templates. */ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt, u8 *data, int len, int *dir) { struct net *net = sock_net(sk); struct xfrm_userpolicy_info *p = (struct xfrm_userpolicy_info *)data; struct xfrm_user_tmpl *ut = (struct xfrm_user_tmpl *) (p + 1); struct xfrm_policy *xp; int nr; switch (sk->sk_family) { case AF_INET: if (opt != IP_XFRM_POLICY) { *dir = -EOPNOTSUPP; return NULL; } break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: if (opt != IPV6_XFRM_POLICY) { *dir = -EOPNOTSUPP; return NULL; } break; #endif default: *dir = -EINVAL; return NULL; } *dir = -EINVAL; if (len < sizeof(*p) || verify_newpolicy_info(p, NULL)) return NULL; nr = ((len - sizeof(*p)) / sizeof(*ut)); if (validate_tmpl(nr, ut, p->sel.family, p->dir, NULL)) return NULL; if (p->dir > XFRM_POLICY_OUT) return NULL; xp = xfrm_policy_alloc(net, GFP_ATOMIC); if (xp == NULL) { *dir = -ENOBUFS; return NULL; } copy_from_user_policy(xp, p); xp->type = XFRM_POLICY_TYPE_MAIN; copy_templates(xp, ut, nr); *dir = p->dir; return xp; } static inline unsigned int xfrm_polexpire_msgsize(struct xfrm_policy *xp) { return NLMSG_ALIGN(sizeof(struct xfrm_user_polexpire)) + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr) + nla_total_size(xfrm_user_sec_ctx_size(xp->security)) + nla_total_size(sizeof(struct xfrm_mark)) + userpolicy_type_attrsize(); } static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, int dir, const struct km_event *c) { struct xfrm_user_polexpire *upe; int hard = c->data.hard; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, c->portid, 0, XFRM_MSG_POLEXPIRE, sizeof(*upe), 0); if (nlh == NULL) return -EMSGSIZE; upe = nlmsg_data(nlh); copy_to_user_policy(xp, &upe->pol, dir); err = copy_to_user_tmpl(xp, skb); if (!err) err = copy_to_user_sec_ctx(xp, skb); if (!err) err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); if (!err && xp->xdo.dev) err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; } upe->hard = !!hard; nlmsg_end(skb, nlh); return 0; } static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { struct net *net = xp_net(xp); struct sk_buff *skb; int err; skb = nlmsg_new(xfrm_polexpire_msgsize(xp), GFP_ATOMIC); if (skb == NULL) return -ENOMEM; err = build_polexpire(skb, xp, dir, c); BUG_ON(err < 0); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_EXPIRE); } static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c) { unsigned int len = nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); struct net *net = xp_net(xp); struct xfrm_userpolicy_info *p; struct xfrm_userpolicy_id *id; struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int headlen; int err; headlen = sizeof(*p); if (c->event == XFRM_MSG_DELPOLICY) { len += nla_total_size(headlen); headlen = sizeof(*id); } len += userpolicy_type_attrsize(); len += nla_total_size(sizeof(struct xfrm_mark)); len += NLMSG_ALIGN(headlen); skb = nlmsg_new(len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; nlh = nlmsg_put(skb, c->portid, c->seq, c->event, headlen, 0); err = -EMSGSIZE; if (nlh == NULL) goto out_free_skb; p = nlmsg_data(nlh); if (c->event == XFRM_MSG_DELPOLICY) { struct nlattr *attr; id = nlmsg_data(nlh); memset(id, 0, sizeof(*id)); id->dir = dir; if (c->data.byid) id->index = xp->index; else memcpy(&id->sel, &xp->selector, sizeof(id->sel)); attr = nla_reserve(skb, XFRMA_POLICY, sizeof(*p)); err = -EMSGSIZE; if (attr == NULL) goto out_free_skb; p = nla_data(attr); } copy_to_user_policy(xp, p, dir); err = copy_to_user_tmpl(xp, skb); if (!err) err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); if (!err && xp->xdo.dev) err = copy_user_offload(&xp->xdo, skb); if (err) goto out_free_skb; nlmsg_end(skb, nlh); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY); out_free_skb: kfree_skb(skb); return err; } static int xfrm_notify_policy_flush(const struct km_event *c) { struct net *net = c->net; struct nlmsghdr *nlh; struct sk_buff *skb; int err; skb = nlmsg_new(userpolicy_type_attrsize(), GFP_ATOMIC); if (skb == NULL) return -ENOMEM; nlh = nlmsg_put(skb, c->portid, c->seq, XFRM_MSG_FLUSHPOLICY, 0, 0); err = -EMSGSIZE; if (nlh == NULL) goto out_free_skb; err = copy_to_user_policy_type(c->data.type, skb); if (err) goto out_free_skb; nlmsg_end(skb, nlh); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY); out_free_skb: kfree_skb(skb); return err; } static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { switch (c->event) { case XFRM_MSG_NEWPOLICY: case XFRM_MSG_UPDPOLICY: case XFRM_MSG_DELPOLICY: return xfrm_notify_policy(xp, dir, c); case XFRM_MSG_FLUSHPOLICY: return xfrm_notify_policy_flush(c); case XFRM_MSG_POLEXPIRE: return xfrm_exp_policy_notify(xp, dir, c); default: printk(KERN_NOTICE "xfrm_user: Unknown Policy event %d\n", c->event); } return 0; } static inline unsigned int xfrm_report_msgsize(void) { return NLMSG_ALIGN(sizeof(struct xfrm_user_report)); } static int build_report(struct sk_buff *skb, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr) { struct xfrm_user_report *ur; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_REPORT, sizeof(*ur), 0); if (nlh == NULL) return -EMSGSIZE; ur = nlmsg_data(nlh); ur->proto = proto; memcpy(&ur->sel, sel, sizeof(ur->sel)); if (addr) { int err = nla_put(skb, XFRMA_COADDR, sizeof(*addr), addr); if (err) { nlmsg_cancel(skb, nlh); return err; } } nlmsg_end(skb, nlh); return 0; } static int xfrm_send_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr) { struct sk_buff *skb; int err; skb = nlmsg_new(xfrm_report_msgsize(), GFP_ATOMIC); if (skb == NULL) return -ENOMEM; err = build_report(skb, proto, sel, addr); BUG_ON(err < 0); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_REPORT); } static inline unsigned int xfrm_mapping_msgsize(void) { return NLMSG_ALIGN(sizeof(struct xfrm_user_mapping)); } static int build_mapping(struct sk_buff *skb, struct xfrm_state *x, xfrm_address_t *new_saddr, __be16 new_sport) { struct xfrm_user_mapping *um; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_MAPPING, sizeof(*um), 0); if (nlh == NULL) return -EMSGSIZE; um = nlmsg_data(nlh); memcpy(&um->id.daddr, &x->id.daddr, sizeof(um->id.daddr)); um->id.spi = x->id.spi; um->id.family = x->props.family; um->id.proto = x->id.proto; memcpy(&um->new_saddr, new_saddr, sizeof(um->new_saddr)); memcpy(&um->old_saddr, &x->props.saddr, sizeof(um->old_saddr)); um->new_sport = new_sport; um->old_sport = x->encap->encap_sport; um->reqid = x->props.reqid; nlmsg_end(skb, nlh); return 0; } static int xfrm_send_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) { struct net *net = xs_net(x); struct sk_buff *skb; int err; if (x->id.proto != IPPROTO_ESP) return -EINVAL; if (!x->encap) return -EINVAL; skb = nlmsg_new(xfrm_mapping_msgsize(), GFP_ATOMIC); if (skb == NULL) return -ENOMEM; err = build_mapping(skb, x, ipaddr, sport); BUG_ON(err < 0); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_MAPPING); } static bool xfrm_is_alive(const struct km_event *c) { return (bool)xfrm_acquire_is_on(c->net); } static struct xfrm_mgr netlink_mgr = { .notify = xfrm_send_state_notify, .acquire = xfrm_send_acquire, .compile_policy = xfrm_compile_policy, .notify_policy = xfrm_send_policy_notify, .report = xfrm_send_report, .migrate = xfrm_send_migrate, .new_mapping = xfrm_send_mapping, .is_alive = xfrm_is_alive, }; static int __net_init xfrm_user_net_init(struct net *net) { struct sock *nlsk; struct netlink_kernel_cfg cfg = { .groups = XFRMNLGRP_MAX, .input = xfrm_netlink_rcv, }; nlsk = netlink_kernel_create(net, NETLINK_XFRM, &cfg); if (nlsk == NULL) return -ENOMEM; net->xfrm.nlsk_stash = nlsk; /* Don't set to NULL */ rcu_assign_pointer(net->xfrm.nlsk, nlsk); return 0; } static void __net_exit xfrm_user_net_pre_exit(struct net *net) { RCU_INIT_POINTER(net->xfrm.nlsk, NULL); } static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list) { struct net *net; list_for_each_entry(net, net_exit_list, exit_list) netlink_kernel_release(net->xfrm.nlsk_stash); } static struct pernet_operations xfrm_user_net_ops = { .init = xfrm_user_net_init, .pre_exit = xfrm_user_net_pre_exit, .exit_batch = xfrm_user_net_exit, }; static int __init xfrm_user_init(void) { int rv; printk(KERN_INFO "Initializing XFRM netlink socket\n"); rv = register_pernet_subsys(&xfrm_user_net_ops); if (rv < 0) return rv; xfrm_register_km(&netlink_mgr); return 0; } static void __exit xfrm_user_exit(void) { xfrm_unregister_km(&netlink_mgr); unregister_pernet_subsys(&xfrm_user_net_ops); } module_init(xfrm_user_init); module_exit(xfrm_user_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM); |
1528 87 87 22 1528 1490 1490 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MIN_HEAP_H #define _LINUX_MIN_HEAP_H #include <linux/bug.h> #include <linux/string.h> #include <linux/types.h> /** * struct min_heap - Data structure to hold a min-heap. * @data: Start of array holding the heap elements. * @nr: Number of elements currently in the heap. * @size: Maximum number of elements that can be held in current storage. */ struct min_heap { void *data; int nr; int size; }; /** * struct min_heap_callbacks - Data/functions to customise the min_heap. * @elem_size: The nr of each element in bytes. * @less: Partial order function for this heap. * @swp: Swap elements function. */ struct min_heap_callbacks { int elem_size; bool (*less)(const void *lhs, const void *rhs); void (*swp)(void *lhs, void *rhs); }; /* Sift the element at pos down the heap. */ static __always_inline void min_heapify(struct min_heap *heap, int pos, const struct min_heap_callbacks *func) { void *left, *right, *parent, *smallest; void *data = heap->data; for (;;) { if (pos * 2 + 1 >= heap->nr) break; left = data + ((pos * 2 + 1) * func->elem_size); parent = data + (pos * func->elem_size); smallest = parent; if (func->less(left, smallest)) smallest = left; if (pos * 2 + 2 < heap->nr) { right = data + ((pos * 2 + 2) * func->elem_size); if (func->less(right, smallest)) smallest = right; } if (smallest == parent) break; func->swp(smallest, parent); if (smallest == left) pos = (pos * 2) + 1; else pos = (pos * 2) + 2; } } /* Floyd's approach to heapification that is O(nr). */ static __always_inline void min_heapify_all(struct min_heap *heap, const struct min_heap_callbacks *func) { int i; for (i = heap->nr / 2; i >= 0; i--) min_heapify(heap, i, func); } /* Remove minimum element from the heap, O(log2(nr)). */ static __always_inline void min_heap_pop(struct min_heap *heap, const struct min_heap_callbacks *func) { void *data = heap->data; if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap")) return; /* Place last element at the root (position 0) and then sift down. */ heap->nr--; memcpy(data, data + (heap->nr * func->elem_size), func->elem_size); min_heapify(heap, 0, func); } /* * Remove the minimum element and then push the given element. The * implementation performs 1 sift (O(log2(nr))) and is therefore more * efficient than a pop followed by a push that does 2. */ static __always_inline void min_heap_pop_push(struct min_heap *heap, const void *element, const struct min_heap_callbacks *func) { memcpy(heap->data, element, func->elem_size); min_heapify(heap, 0, func); } /* Push an element on to the heap, O(log2(nr)). */ static __always_inline void min_heap_push(struct min_heap *heap, const void *element, const struct min_heap_callbacks *func) { void *data = heap->data; void *child, *parent; int pos; if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap")) return; /* Place at the end of data. */ pos = heap->nr; memcpy(data + (pos * func->elem_size), element, func->elem_size); heap->nr++; /* Sift child at pos up. */ for (; pos > 0; pos = (pos - 1) / 2) { child = data + (pos * func->elem_size); parent = data + ((pos - 1) / 2) * func->elem_size; if (func->less(parent, child)) break; func->swp(parent, child); } } #endif /* _LINUX_MIN_HEAP_H */ |
4 52 52 52 52 52 52 52 52 52 52 52 50 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool_netlink.h> #include <linux/bitmap.h> #include "netlink.h" #include "bitset.h" /* Some bitmaps are internally represented as an array of unsigned long, some * as an array of u32 (some even as single u32 for now). To avoid the need of * wrappers on caller side, we provide two set of functions: those with "32" * suffix in their names expect u32 based bitmaps, those without it expect * unsigned long bitmaps. */ static u32 ethnl_lower_bits(unsigned int n) { return ~(u32)0 >> (32 - n % 32); } static u32 ethnl_upper_bits(unsigned int n) { return ~(u32)0 << (n % 32); } /** * ethnl_bitmap32_clear() - Clear u32 based bitmap * @dst: bitmap to clear * @start: beginning of the interval * @end: end of the interval * @mod: set if bitmap was modified * * Clear @nbits bits of a bitmap with indices @start <= i < @end */ static void ethnl_bitmap32_clear(u32 *dst, unsigned int start, unsigned int end, bool *mod) { unsigned int start_word = start / 32; unsigned int end_word = end / 32; unsigned int i; u32 mask; if (end <= start) return; if (start % 32) { mask = ethnl_upper_bits(start); if (end_word == start_word) { mask &= ethnl_lower_bits(end); if (dst[start_word] & mask) { dst[start_word] &= ~mask; *mod = true; } return; } if (dst[start_word] & mask) { dst[start_word] &= ~mask; *mod = true; } start_word++; } for (i = start_word; i < end_word; i++) { if (dst[i]) { dst[i] = 0; *mod = true; } } if (end % 32) { mask = ethnl_lower_bits(end); if (dst[end_word] & mask) { dst[end_word] &= ~mask; *mod = true; } } } /** * ethnl_bitmap32_not_zero() - Check if any bit is set in an interval * @map: bitmap to test * @start: beginning of the interval * @end: end of the interval * * Return: true if there is non-zero bit with index @start <= i < @end, * false if the whole interval is zero */ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, unsigned int end) { unsigned int start_word = start / 32; unsigned int end_word = end / 32; u32 mask; if (end <= start) return true; if (start % 32) { mask = ethnl_upper_bits(start); if (end_word == start_word) { mask &= ethnl_lower_bits(end); return map[start_word] & mask; } if (map[start_word] & mask) return true; start_word++; } if (!memchr_inv(map + start_word, '\0', (end_word - start_word) * sizeof(u32))) return true; if (end % 32 == 0) return true; return map[end_word] & ethnl_lower_bits(end); } /** * ethnl_bitmap32_update() - Modify u32 based bitmap according to value/mask * pair * @dst: bitmap to update * @nbits: bit size of the bitmap * @value: values to set * @mask: mask of bits to set * @mod: set to true if bitmap is modified, preserve if not * * Set bits in @dst bitmap which are set in @mask to values from @value, leave * the rest untouched. If destination bitmap was modified, set @mod to true, * leave as it is if not. */ static void ethnl_bitmap32_update(u32 *dst, unsigned int nbits, const u32 *value, const u32 *mask, bool *mod) { while (nbits > 0) { u32 real_mask = mask ? *mask : ~(u32)0; u32 new_value; if (nbits < 32) real_mask &= ethnl_lower_bits(nbits); new_value = (*dst & ~real_mask) | (*value & real_mask); if (new_value != *dst) { *dst = new_value; *mod = true; } if (nbits <= 32) break; dst++; nbits -= 32; value++; if (mask) mask++; } } static bool ethnl_bitmap32_test_bit(const u32 *map, unsigned int index) { return map[index / 32] & (1U << (index % 32)); } /** * ethnl_bitset32_size() - Calculate size of bitset nested attribute * @val: value bitmap (u32 based) * @mask: mask bitmap (u32 based, optional) * @nbits: bit length of the bitset * @names: array of bit names (optional) * @compact: assume compact format for output * * Estimate length of netlink attribute composed by a later call to * ethnl_put_bitset32() call with the same arguments. * * Return: negative error code or attribute length estimate */ int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { unsigned int len = 0; /* list flag */ if (!mask) len += nla_total_size(sizeof(u32)); /* size */ len += nla_total_size(sizeof(u32)); if (compact) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); /* value, mask */ len += (mask ? 2 : 1) * nla_total_size(nwords * sizeof(u32)); } else { unsigned int bits_len = 0; unsigned int bit_len, i; for (i = 0; i < nbits; i++) { const char *name = names ? names[i] : NULL; if (!ethnl_bitmap32_test_bit(mask ?: val, i)) continue; /* index */ bit_len = nla_total_size(sizeof(u32)); /* name */ if (name) bit_len += ethnl_strz_size(name); /* value */ if (mask && ethnl_bitmap32_test_bit(val, i)) bit_len += nla_total_size(0); /* bit nest */ bits_len += nla_total_size(bit_len); } /* bits nest */ len += nla_total_size(bits_len); } /* outermost nest */ return nla_total_size(len); } /** * ethnl_put_bitset32() - Put a bitset nest into a message * @skb: skb with the message * @attrtype: attribute type for the bitset nest * @val: value bitmap (u32 based) * @mask: mask bitmap (u32 based, optional) * @nbits: bit length of the bitset * @names: array of bit names (optional) * @compact: use compact format for the output * * Compose a nested attribute representing a bitset. If @mask is null, simple * bitmap (bit list) is created, if @mask is provided, represent a value/mask * pair. Bit names are only used in verbose mode and when provided by calller. * * Return: 0 on success, negative error value on error */ int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val, const u32 *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { struct nlattr *nest; struct nlattr *attr; nest = nla_nest_start(skb, attrtype); if (!nest) return -EMSGSIZE; if (!mask && nla_put_flag(skb, ETHTOOL_A_BITSET_NOMASK)) goto nla_put_failure; if (nla_put_u32(skb, ETHTOOL_A_BITSET_SIZE, nbits)) goto nla_put_failure; if (compact) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); unsigned int nbytes = nwords * sizeof(u32); u32 *dst; attr = nla_reserve(skb, ETHTOOL_A_BITSET_VALUE, nbytes); if (!attr) goto nla_put_failure; dst = nla_data(attr); memcpy(dst, val, nbytes); if (nbits % 32) dst[nwords - 1] &= ethnl_lower_bits(nbits); if (mask) { attr = nla_reserve(skb, ETHTOOL_A_BITSET_MASK, nbytes); if (!attr) goto nla_put_failure; dst = nla_data(attr); memcpy(dst, mask, nbytes); if (nbits % 32) dst[nwords - 1] &= ethnl_lower_bits(nbits); } } else { struct nlattr *bits; unsigned int i; bits = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS); if (!bits) goto nla_put_failure; for (i = 0; i < nbits; i++) { const char *name = names ? names[i] : NULL; if (!ethnl_bitmap32_test_bit(mask ?: val, i)) continue; attr = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS_BIT); if (!attr) goto nla_put_failure; if (nla_put_u32(skb, ETHTOOL_A_BITSET_BIT_INDEX, i)) goto nla_put_failure; if (name && ethnl_put_strz(skb, ETHTOOL_A_BITSET_BIT_NAME, name)) goto nla_put_failure; if (mask && ethnl_bitmap32_test_bit(val, i) && nla_put_flag(skb, ETHTOOL_A_BITSET_BIT_VALUE)) goto nla_put_failure; nla_nest_end(skb, attr); } nla_nest_end(skb, bits); } nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static const struct nla_policy bitset_policy[] = { [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG }, [ETHTOOL_A_BITSET_SIZE] = NLA_POLICY_MAX(NLA_U32, ETHNL_MAX_BITSET_SIZE), [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED }, [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY }, [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY }, }; static const struct nla_policy bit_policy[] = { [ETHTOOL_A_BITSET_BIT_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_BITSET_BIT_NAME] = { .type = NLA_NUL_STRING }, [ETHTOOL_A_BITSET_BIT_VALUE] = { .type = NLA_FLAG }, }; /** * ethnl_bitset_is_compact() - check if bitset attribute represents a compact * bitset * @bitset: nested attribute representing a bitset * @compact: pointer for return value * * Return: 0 on success, negative error code on failure */ int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; int ret; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, bitset, bitset_policy, NULL); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BITS]) { if (tb[ETHTOOL_A_BITSET_VALUE] || tb[ETHTOOL_A_BITSET_MASK]) return -EINVAL; *compact = false; return 0; } if (!tb[ETHTOOL_A_BITSET_SIZE] || !tb[ETHTOOL_A_BITSET_VALUE]) return -EINVAL; *compact = true; return 0; } /** * ethnl_name_to_idx() - look up string index for a name * @names: array of ETH_GSTRING_LEN sized strings * @n_names: number of strings in the array * @name: name to look up * * Return: index of the string if found, -ENOENT if not found */ static int ethnl_name_to_idx(ethnl_string_array_t names, unsigned int n_names, const char *name) { unsigned int i; if (!names) return -ENOENT; for (i = 0; i < n_names; i++) { /* names[i] may not be null terminated */ if (!strncmp(names[i], name, ETH_GSTRING_LEN) && strlen(name) <= ETH_GSTRING_LEN) return i; } return -ENOENT; } static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits, const struct nlattr *bit_attr, bool no_mask, ethnl_string_array_t names, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(bit_policy)]; int ret, idx; ret = nla_parse_nested(tb, ARRAY_SIZE(bit_policy) - 1, bit_attr, bit_policy, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BIT_INDEX]) { const char *name; idx = nla_get_u32(tb[ETHTOOL_A_BITSET_BIT_INDEX]); if (idx >= nbits) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_BIT_INDEX], "bit index too high"); return -EOPNOTSUPP; } name = names ? names[idx] : NULL; if (tb[ETHTOOL_A_BITSET_BIT_NAME] && name && strncmp(nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]), name, nla_len(tb[ETHTOOL_A_BITSET_BIT_NAME]))) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "bit index and name mismatch"); return -EINVAL; } } else if (tb[ETHTOOL_A_BITSET_BIT_NAME]) { idx = ethnl_name_to_idx(names, nbits, nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME])); if (idx < 0) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_BIT_NAME], "bit name not found"); return -EOPNOTSUPP; } } else { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "neither bit index nor name specified"); return -EINVAL; } *index = idx; *val = no_mask || tb[ETHTOOL_A_BITSET_BIT_VALUE]; return 0; } static int ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, const struct nlattr *attr, struct nlattr **tb, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { struct nlattr *bit_attr; bool no_mask; int rem; int ret; if (tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "value only allowed in compact bitset"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask only allowed in compact bitset"); return -EINVAL; } no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; if (no_mask) ethnl_bitmap32_clear(bitmap, 0, nbits, mod); nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { bool old_val, new_val; unsigned int idx; if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS"); return -EINVAL; } ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask, names, extack); if (ret < 0) return ret; old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32)); if (new_val != old_val) { if (new_val) bitmap[idx / 32] |= ((u32)1 << (idx % 32)); else bitmap[idx / 32] &= ~((u32)1 << (idx % 32)); *mod = true; } } return 0; } static int ethnl_compact_sanity_checks(unsigned int nbits, const struct nlattr *nest, struct nlattr **tb, struct netlink_ext_ack *extack) { bool no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; unsigned int attr_nbits, attr_nwords; const struct nlattr *test_attr; if (no_mask && tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask not allowed in list bitset"); return -EINVAL; } if (!tb[ETHTOOL_A_BITSET_SIZE]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing size in compact bitset"); return -EINVAL; } if (!tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing value in compact bitset"); return -EINVAL; } if (!no_mask && !tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing mask in compact nonlist bitset"); return -EINVAL; } attr_nbits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); attr_nwords = DIV_ROUND_UP(attr_nbits, 32); if (nla_len(tb[ETHTOOL_A_BITSET_VALUE]) != attr_nwords * sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "bitset value length does not match size"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK] && nla_len(tb[ETHTOOL_A_BITSET_MASK]) != attr_nwords * sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "bitset mask length does not match size"); return -EINVAL; } if (attr_nbits <= nbits) return 0; test_attr = no_mask ? tb[ETHTOOL_A_BITSET_VALUE] : tb[ETHTOOL_A_BITSET_MASK]; if (ethnl_bitmap32_not_zero(nla_data(test_attr), nbits, attr_nbits)) { NL_SET_ERR_MSG_ATTR(extack, test_attr, "cannot modify bits past kernel bitset size"); return -EINVAL; } return 0; } /** * ethnl_update_bitset32() - Apply a bitset nest to a u32 based bitmap * @bitmap: bitmap to update * @nbits: size of the updated bitmap in bits * @attr: nest attribute to parse and apply * @names: array of bit names; may be null for compact format * @extack: extack for error reporting * @mod: set this to true if bitmap is modified, leave as it is if not * * Apply bitset netsted attribute to a bitmap. If the attribute represents * a bit list, @bitmap is set to its contents; otherwise, bits in mask are * set to values from value. Bitmaps in the attribute may be longer than * @nbits but the message must not request modifying any bits past @nbits. * * Return: negative error code on failure, 0 on success */ int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; unsigned int change_bits; bool no_mask; int ret; if (!attr) return 0; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr, bitset_policy, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BITS]) return ethnl_update_bitset32_verbose(bitmap, nbits, attr, tb, names, extack, mod); ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); if (ret < 0) return ret; no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; change_bits = min_t(unsigned int, nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]), nbits); ethnl_bitmap32_update(bitmap, change_bits, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), no_mask ? NULL : nla_data(tb[ETHTOOL_A_BITSET_MASK]), mod); if (no_mask && change_bits < nbits) ethnl_bitmap32_clear(bitmap, change_bits, nbits, mod); return 0; } /** * ethnl_parse_bitset() - Compute effective value and mask from bitset nest * @val: unsigned long based bitmap to put value into * @mask: unsigned long based bitmap to put mask into * @nbits: size of @val and @mask bitmaps * @attr: nest attribute to parse and apply * @names: array of bit names; may be null for compact format * @extack: extack for error reporting * * Provide @nbits size long bitmaps for value and mask so that * x = (val & mask) | (x & ~mask) would modify any @nbits sized bitmap x * the same way ethnl_update_bitset() with the same bitset attribute would. * * Return: negative error code on failure, 0 on success */ int ethnl_parse_bitset(unsigned long *val, unsigned long *mask, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; const struct nlattr *bit_attr; bool no_mask; int rem; int ret; if (!attr) return 0; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr, bitset_policy, extack); if (ret < 0) return ret; no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; if (!tb[ETHTOOL_A_BITSET_BITS]) { unsigned int change_bits; ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); if (ret < 0) return ret; change_bits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); if (change_bits > nbits) change_bits = nbits; bitmap_from_arr32(val, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), change_bits); if (change_bits < nbits) bitmap_clear(val, change_bits, nbits - change_bits); if (no_mask) { bitmap_fill(mask, nbits); } else { bitmap_from_arr32(mask, nla_data(tb[ETHTOOL_A_BITSET_MASK]), change_bits); if (change_bits < nbits) bitmap_clear(mask, change_bits, nbits - change_bits); } return 0; } if (tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "value only allowed in compact bitset"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask only allowed in compact bitset"); return -EINVAL; } bitmap_zero(val, nbits); if (no_mask) bitmap_fill(mask, nbits); else bitmap_zero(mask, nbits); nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { unsigned int idx; bool bit_val; ret = ethnl_parse_bit(&idx, &bit_val, nbits, bit_attr, no_mask, names, extack); if (ret < 0) return ret; if (bit_val) __set_bit(idx, val); if (!no_mask) __set_bit(idx, mask); } return 0; } #if BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) /* 64-bit big endian architectures are the only case when u32 based bitmaps * and unsigned long based bitmaps have different memory layout so that we * cannot simply cast the latter to the former and need actual wrappers * converting the latter to the former. * * To reduce the number of slab allocations, the wrappers use fixed size local * variables for bitmaps up to ETHNL_SMALL_BITMAP_BITS bits which is the * majority of bitmaps used by ethtool. */ #define ETHNL_SMALL_BITMAP_BITS 128 #define ETHNL_SMALL_BITMAP_WORDS DIV_ROUND_UP(ETHNL_SMALL_BITMAP_BITS, 32) int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; u32 *mask32; u32 *val32; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); if (!val32) return -ENOMEM; mask32 = val32 + nwords; } else { val32 = small_val32; mask32 = small_mask32; } bitmap_to_arr32(val32, val, nbits); if (mask) bitmap_to_arr32(mask32, mask, nbits); else mask32 = NULL; ret = ethnl_bitset32_size(val32, mask32, nbits, names, compact); if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(val32); return ret; } int ethnl_put_bitset(struct sk_buff *skb, int attrtype, const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; u32 *mask32; u32 *val32; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); if (!val32) return -ENOMEM; mask32 = val32 + nwords; } else { val32 = small_val32; mask32 = small_mask32; } bitmap_to_arr32(val32, val, nbits); if (mask) bitmap_to_arr32(mask32, mask, nbits); else mask32 = NULL; ret = ethnl_put_bitset32(skb, attrtype, val32, mask32, nbits, names, compact); if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(val32); return ret; } int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { u32 small_bitmap32[ETHNL_SMALL_BITMAP_WORDS]; u32 *bitmap32 = small_bitmap32; bool u32_mod = false; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int dst_words = DIV_ROUND_UP(nbits, 32); bitmap32 = kmalloc_array(dst_words, sizeof(u32), GFP_KERNEL); if (!bitmap32) return -ENOMEM; } bitmap_to_arr32(bitmap32, bitmap, nbits); ret = ethnl_update_bitset32(bitmap32, nbits, attr, names, extack, &u32_mod); if (u32_mod) { bitmap_from_arr32(bitmap, bitmap32, nbits); *mod = true; } if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(bitmap32); return ret; } #else /* On little endian 64-bit and all 32-bit architectures, an unsigned long * based bitmap can be interpreted as u32 based one using a simple cast. */ int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { return ethnl_bitset32_size((const u32 *)val, (const u32 *)mask, nbits, names, compact); } int ethnl_put_bitset(struct sk_buff *skb, int attrtype, const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { return ethnl_put_bitset32(skb, attrtype, (const u32 *)val, (const u32 *)mask, nbits, names, compact); } int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { return ethnl_update_bitset32((u32 *)bitmap, nbits, attr, names, extack, mod); } #endif /* BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) */ |
462 462 462 462 462 462 2 461 461 461 462 343 343 343 343 132 132 132 462 340 340 338 340 338 340 340 340 340 450 450 450 340 450 450 4 1 1 4 131 132 132 132 169 169 169 132 169 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/base.c * * Copyright (C) 1991, 1992 Linus Torvalds * * proc base directory handling functions * * 1999, Al Viro. Rewritten. Now it covers the whole per-process part. * Instead of using magical inumbers to determine the kind of object * we allocate and fill in-core inodes upon lookup. They don't even * go into icache. We cache the reference to task_struct upon lookup too. * Eventually it should become a filesystem in its own. We don't use the * rest of procfs anymore. * * * Changelog: * 17-Jan-2005 * Allan Bezerra * Bruna Moreira <bruna.moreira@indt.org.br> * Edjard Mota <edjard.mota@indt.org.br> * Ilias Biris <ilias.biris@indt.org.br> * Mauricio Lin <mauricio.lin@indt.org.br> * * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT * * A new process specific entry (smaps) included in /proc. It shows the * size of rss for each memory area. The maps entry lacks information * about physical memory size (rss) for each mapped file, i.e., * rss information for executables and library files. * This additional information is useful for any tools that need to know * about physical memory consumption for a process specific library. * * Changelog: * 21-Feb-2005 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT * Pud inclusion in the page table walking. * * ChangeLog: * 10-Mar-2005 * 10LE Instituto Nokia de Tecnologia - INdT: * A better way to walks through the page table as suggested by Hugh Dickins. * * Simo Piiroinen <simo.piiroinen@nokia.com>: * Smaps information related to shared, private, clean and dirty pages. * * Paul Mundt <paul.mundt@nokia.com>: * Overall revision about smaps. */ #include <linux/uaccess.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/task_io_accounting_ops.h> #include <linux/init.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/generic-radix-tree.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/namei.h> #include <linux/mnt_namespace.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/rcupdate.h> #include <linux/kallsyms.h> #include <linux/stacktrace.h> #include <linux/resource.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/ptrace.h> #include <linux/printk.h> #include <linux/cache.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/audit.h> #include <linux/poll.h> #include <linux/nsproxy.h> #include <linux/oom.h> #include <linux/elf.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/debug.h> #include <linux/sched/stat.h> #include <linux/posix-timers.h> #include <linux/time_namespace.h> #include <linux/resctrl.h> #include <linux/cn_proc.h> #include <linux/ksm.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" #include "../../lib/kstrtox.h" /* NOTE: * Implementing inode permission operations in /proc is almost * certainly an error. Permission checks need to happen during * each system call not at open time. The reason is that most of * what we wish to check for permissions in /proc varies at runtime. * * The classic example of a problem is opening file descriptors * in /proc for a task before it execs a suid executable. */ static u8 nlink_tid __ro_after_init; static u8 nlink_tgid __ro_after_init; struct pid_entry { const char *name; unsigned int len; umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; union proc_op op; }; #define NOD(NAME, MODE, IOP, FOP, OP) { \ .name = (NAME), \ .len = sizeof(NAME) - 1, \ .mode = MODE, \ .iop = IOP, \ .fop = FOP, \ .op = OP, \ } #define DIR(NAME, MODE, iops, fops) \ NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) #define LNK(NAME, get_link) \ NOD(NAME, (S_IFLNK|S_IRWXUGO), \ &proc_pid_link_inode_operations, NULL, \ { .proc_get_link = get_link } ) #define REG(NAME, MODE, fops) \ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) #define ONE(NAME, MODE, show) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ { .proc_show = show } ) #define ATTR(LSM, NAME, MODE) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_pid_attr_operations, \ { .lsm = LSM }) /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. */ static unsigned int __init pid_entry_nlink(const struct pid_entry *entries, unsigned int n) { unsigned int i; unsigned int count; count = 2; for (i = 0; i < n; ++i) { if (S_ISDIR(entries[i].mode)) ++count; } return count; } static int get_task_root(struct task_struct *task, struct path *root) { int result = -ENOENT; task_lock(task); if (task->fs) { get_fs_root(task->fs, root); result = 0; } task_unlock(task); return result; } static int proc_cwd_link(struct dentry *dentry, struct path *path) { struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { task_lock(task); if (task->fs) { get_fs_pwd(task->fs, path); result = 0; } task_unlock(task); put_task_struct(task); } return result; } static int proc_root_link(struct dentry *dentry, struct path *path) { struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { result = get_task_root(task, path); put_task_struct(task); } return result; } /* * If the user used setproctitle(), we just get the string from * user space at arg_start, and limit it to a maximum of one page. */ static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf, size_t count, unsigned long pos, unsigned long arg_start) { char *page; int ret, got; if (pos >= PAGE_SIZE) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; ret = 0; got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON); if (got > 0) { int len = strnlen(page, got); /* Include the NUL character if it was found */ if (len < got) len++; if (len > pos) { len -= pos; if (len > count) len = count; len -= copy_to_user(buf, page+pos, len); if (!len) len = -EFAULT; ret = len; } } free_page((unsigned long)page); return ret; } static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, size_t count, loff_t *ppos) { unsigned long arg_start, arg_end, env_start, env_end; unsigned long pos, len; char *page, c; /* Check if process spawned far enough to have cmdline. */ if (!mm->env_end) return 0; spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); if (arg_start >= arg_end) return 0; /* * We allow setproctitle() to overwrite the argument * strings, and overflow past the original end. But * only when it overflows into the environment area. */ if (env_start != arg_end || env_end < env_start) env_start = env_end = arg_end; len = env_end - arg_start; /* We're not going to care if "*ppos" has high bits set */ pos = *ppos; if (pos >= len) return 0; if (count > len - pos) count = len - pos; if (!count) return 0; /* * Magical special case: if the argv[] end byte is not * zero, the user has overwritten it with setproctitle(3). * * Possible future enhancement: do this only once when * pos is 0, and set a flag in the 'struct file'. */ if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c) return get_mm_proctitle(mm, buf, count, pos, arg_start); /* * For the non-setproctitle() case we limit things strictly * to the [arg_start, arg_end[ range. */ pos += arg_start; if (pos < arg_start || pos >= arg_end) return 0; if (count > arg_end - pos) count = arg_end - pos; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; len = 0; while (count) { int got; size_t size = min_t(size_t, PAGE_SIZE, count); got = access_remote_vm(mm, pos, page, size, FOLL_ANON); if (got <= 0) break; got -= copy_to_user(buf, page, got); if (unlikely(!got)) { if (!len) len = -EFAULT; break; } pos += got; buf += got; len += got; count -= got; } free_page((unsigned long)page); return len; } static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf, size_t count, loff_t *pos) { struct mm_struct *mm; ssize_t ret; mm = get_task_mm(tsk); if (!mm) return 0; ret = get_mm_cmdline(mm, buf, count, pos); mmput(mm); return ret; } static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct task_struct *tsk; ssize_t ret; BUG_ON(*pos < 0); tsk = get_proc_task(file_inode(file)); if (!tsk) return -ESRCH; ret = get_task_cmdline(tsk, buf, count, pos); put_task_struct(tsk); if (ret > 0) *pos += ret; return ret; } static const struct file_operations proc_pid_cmdline_ops = { .read = proc_pid_cmdline_read, .llseek = generic_file_llseek, }; #ifdef CONFIG_KALLSYMS /* * Provides a wchan file via kallsyms in a proper one-value-per-file format. * Returns the resolved symbol. If that fails, simply return the address. */ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long wchan; char symname[KSYM_NAME_LEN]; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto print0; wchan = get_wchan(task); if (wchan && !lookup_symbol_name(wchan, symname)) { seq_puts(m, symname); return 0; } print0: seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ static int lock_trace(struct task_struct *task) { int err = down_read_killable(&task->signal->exec_update_lock); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { up_read(&task->signal->exec_update_lock); return -EPERM; } return 0; } static void unlock_trace(struct task_struct *task) { up_read(&task->signal->exec_update_lock); } #ifdef CONFIG_STACKTRACE #define MAX_STACK_TRACE_DEPTH 64 static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long *entries; int err; /* * The ability to racily run the kernel stack unwinder on a running task * and then observe the unwinder output is scary; while it is useful for * debugging kernel issues, it can also allow an attacker to leak kernel * stack contents. * Doing this in a manner that is at least safe from races would require * some work to ensure that the remote task can not be scheduled; and * even then, this would still expose the unwinder as local attack * surface. * Therefore, this interface is restricted to root. */ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) return -EACCES; entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), GFP_KERNEL); if (!entries) return -ENOMEM; err = lock_trace(task); if (!err) { unsigned int i, nr_entries; nr_entries = stack_trace_save_tsk(task, entries, MAX_STACK_TRACE_DEPTH, 0); for (i = 0; i < nr_entries; i++) { seq_printf(m, "[<0>] %pB\n", (void *)entries[i]); } unlock_trace(task); } kfree(entries); return err; } #endif #ifdef CONFIG_SCHED_INFO /* * Provides /proc/PID/schedstat */ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { if (unlikely(!sched_info_on())) seq_puts(m, "0 0 0\n"); else seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); return 0; } #endif #ifdef CONFIG_LATENCYTOP static int lstats_show_proc(struct seq_file *m, void *v) { int i; struct inode *inode = m->private; struct task_struct *task = get_proc_task(inode); if (!task) return -ESRCH; seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < LT_SAVECOUNT; i++) { struct latency_record *lr = &task->latency_record[i]; if (lr->backtrace[0]) { int q; seq_printf(m, "%i %li %li", lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long bt = lr->backtrace[q]; if (!bt) break; seq_printf(m, " %ps", (void *)bt); } seq_putc(m, '\n'); } } put_task_struct(task); return 0; } static int lstats_open(struct inode *inode, struct file *file) { return single_open(file, lstats_show_proc, inode); } static ssize_t lstats_write(struct file *file, const char __user *buf, size_t count, loff_t *offs) { struct task_struct *task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; clear_tsk_latency_tracing(task); put_task_struct(task); return count; } static const struct file_operations proc_lstats_operations = { .open = lstats_open, .read = seq_read, .write = lstats_write, .llseek = seq_lseek, .release = single_release, }; #endif static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; long badness; badness = oom_badness(task, totalpages); /* * Special case OOM_SCORE_ADJ_MIN for all others scale the * badness value into [0, 2000] range which we have been * exporting for a long time so userspace might depend on it. */ if (badness != LONG_MIN) points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3; seq_printf(m, "%lu\n", points); return 0; } struct limit_names { const char *name; const char *unit; }; static const struct limit_names lnames[RLIM_NLIMITS] = { [RLIMIT_CPU] = {"Max cpu time", "seconds"}, [RLIMIT_FSIZE] = {"Max file size", "bytes"}, [RLIMIT_DATA] = {"Max data size", "bytes"}, [RLIMIT_STACK] = {"Max stack size", "bytes"}, [RLIMIT_CORE] = {"Max core file size", "bytes"}, [RLIMIT_RSS] = {"Max resident set", "bytes"}, [RLIMIT_NPROC] = {"Max processes", "processes"}, [RLIMIT_NOFILE] = {"Max open files", "files"}, [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"}, [RLIMIT_AS] = {"Max address space", "bytes"}, [RLIMIT_LOCKS] = {"Max file locks", "locks"}, [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"}, [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"}, [RLIMIT_NICE] = {"Max nice priority", NULL}, [RLIMIT_RTPRIO] = {"Max realtime priority", NULL}, [RLIMIT_RTTIME] = {"Max realtime timeout", "us"}, }; /* Display limits for a process */ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned int i; unsigned long flags; struct rlimit rlim[RLIM_NLIMITS]; if (!lock_task_sighand(task, &flags)) return 0; memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); unlock_task_sighand(task, &flags); /* * print the file header */ seq_puts(m, "Limit " "Soft Limit " "Hard Limit " "Units \n"); for (i = 0; i < RLIM_NLIMITS; i++) { if (rlim[i].rlim_cur == RLIM_INFINITY) seq_printf(m, "%-25s %-20s ", lnames[i].name, "unlimited"); else seq_printf(m, "%-25s %-20lu ", lnames[i].name, rlim[i].rlim_cur); if (rlim[i].rlim_max == RLIM_INFINITY) seq_printf(m, "%-20s ", "unlimited"); else seq_printf(m, "%-20lu ", rlim[i].rlim_max); if (lnames[i].unit) seq_printf(m, "%-10s\n", lnames[i].unit); else seq_putc(m, '\n'); } return 0; } #ifdef CONFIG_HAVE_ARCH_TRACEHOOK static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct syscall_info info; u64 *args = &info.data.args[0]; int res; res = lock_trace(task); if (res) return res; if (task_current_syscall(task, &info)) seq_puts(m, "running\n"); else if (info.data.nr < 0) seq_printf(m, "%d 0x%llx 0x%llx\n", info.data.nr, info.sp, info.data.instruction_pointer); else seq_printf(m, "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n", info.data.nr, args[0], args[1], args[2], args[3], args[4], args[5], info.sp, info.data.instruction_pointer); unlock_trace(task); return 0; } #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ /************************************************************************/ /* Here the fs part begins */ /************************************************************************/ /* permission checks */ static bool proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; bool allowed = false; /* Allow access to a task's file descriptors if it is us or we * may use ptrace attach to the process and find out that * information. */ task = get_proc_task(inode); if (task) { allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); put_task_struct(task); } return allowed; } int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = d_inode(dentry); if (attr->ia_valid & ATTR_MODE) return -EPERM; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; setattr_copy(&nop_mnt_idmap, inode, attr); return 0; } /* * May current process learn task's sched/cmdline info (for hide_pid_min=1) * or euid/egid (for hide_pid_min=2)? */ static bool has_pid_permissions(struct proc_fs_info *fs_info, struct task_struct *task, enum proc_hidepid hide_pid_min) { /* * If 'hidpid' mount option is set force a ptrace check, * we indicate that we are using a filesystem syscall * by passing PTRACE_MODE_READ_FSCREDS */ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); if (fs_info->hide_pid < hide_pid_min) return true; if (in_group_p(fs_info->pid_gid)) return true; return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); } static int proc_pid_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; bool has_perms; task = get_proc_task(inode); if (!task) return -ESRCH; has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS); put_task_struct(task); if (!has_perms) { if (fs_info->hide_pid == HIDEPID_INVISIBLE) { /* * Let's make getdents(), stat(), and open() * consistent with each other. If a process * may not stat() a file, it shouldn't be seen * in procfs at all. */ return -ENOENT; } return -EPERM; } return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct pid *pid = proc_pid(inode); struct task_struct *task; int ret; task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; ret = PROC_I(inode)->op.proc_show(m, ns, pid, task); put_task_struct(task); return ret; } static int proc_single_open(struct inode *inode, struct file *filp) { return single_open(filp, proc_single_show, inode); } static const struct file_operations proc_single_file_operations = { .open = proc_single_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); struct mm_struct *mm = ERR_PTR(-ESRCH); if (task) { mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); put_task_struct(task); if (!IS_ERR_OR_NULL(mm)) { /* ensure this mm_struct can't be freed */ mmgrab(mm); /* but do not pin its memory */ mmput(mm); } } return mm; } static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { struct mm_struct *mm = proc_mem_open(inode, mode); if (IS_ERR(mm)) return PTR_ERR(mm); file->private_data = mm; return 0; } static int mem_open(struct inode *inode, struct file *file) { int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); /* OK to pass negative loff_t, we can catch out-of-range */ file->f_mode |= FMODE_UNSIGNED_OFFSET; return ret; } static ssize_t mem_rw(struct file *file, char __user *buf, size_t count, loff_t *ppos, int write) { struct mm_struct *mm = file->private_data; unsigned long addr = *ppos; ssize_t copied; char *page; unsigned int flags; if (!mm) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; copied = 0; if (!mmget_not_zero(mm)) goto free; flags = FOLL_FORCE | (write ? FOLL_WRITE : 0); while (count > 0) { size_t this_len = min_t(size_t, count, PAGE_SIZE); if (write && copy_from_user(page, buf, this_len)) { copied = -EFAULT; break; } this_len = access_remote_vm(mm, addr, page, this_len, flags); if (!this_len) { if (!copied) copied = -EIO; break; } if (!write && copy_to_user(buf, page, this_len)) { copied = -EFAULT; break; } buf += this_len; addr += this_len; copied += this_len; count -= this_len; } *ppos = addr; mmput(mm); free: free_page((unsigned long) page); return copied; } static ssize_t mem_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { return mem_rw(file, buf, count, ppos, 0); } static ssize_t mem_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { return mem_rw(file, (char __user*)buf, count, ppos, 1); } loff_t mem_lseek(struct file *file, loff_t offset, int orig) { switch (orig) { case 0: file->f_pos = offset; break; case 1: file->f_pos += offset; break; default: return -EINVAL; } force_successful_syscall_return(); return file->f_pos; } static int mem_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; if (mm) mmdrop(mm); return 0; } static const struct file_operations proc_mem_operations = { .llseek = mem_lseek, .read = mem_read, .write = mem_write, .open = mem_open, .release = mem_release, }; static int environ_open(struct inode *inode, struct file *file) { return __mem_open(inode, file, PTRACE_MODE_READ); } static ssize_t environ_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { char *page; unsigned long src = *ppos; int ret = 0; struct mm_struct *mm = file->private_data; unsigned long env_start, env_end; /* Ensure the process spawned far enough to have an environment. */ if (!mm || !mm->env_end) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; ret = 0; if (!mmget_not_zero(mm)) goto free; spin_lock(&mm->arg_lock); env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); while (count > 0) { size_t this_len, max_len; int retval; if (src >= (env_end - env_start)) break; this_len = env_end - (env_start + src); max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON); if (retval <= 0) { ret = retval; break; } if (copy_to_user(buf, page, retval)) { ret = -EFAULT; break; } ret += retval; src += retval; buf += retval; count -= retval; } *ppos = src; mmput(mm); free: free_page((unsigned long) page); return ret; } static const struct file_operations proc_environ_operations = { .open = environ_open, .read = environ_read, .llseek = generic_file_llseek, .release = mem_release, }; static int auxv_open(struct inode *inode, struct file *file) { return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); } static ssize_t auxv_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct mm_struct *mm = file->private_data; unsigned int nwords = 0; if (!mm) return 0; do { nwords += 2; } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0])); } static const struct file_operations proc_auxv_operations = { .open = auxv_open, .read = auxv_read, .llseek = generic_file_llseek, .release = mem_release, }; static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; int oom_adj = OOM_ADJUST_MIN; size_t len; if (!task) return -ESRCH; if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) oom_adj = OOM_ADJUST_MAX; else oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / OOM_SCORE_ADJ_MAX; put_task_struct(task); if (oom_adj > OOM_ADJUST_MAX) oom_adj = OOM_ADJUST_MAX; len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) { struct mm_struct *mm = NULL; struct task_struct *task; int err = 0; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mutex_lock(&oom_adj_mutex); if (legacy) { if (oom_adj < task->signal->oom_score_adj && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_unlock; } /* * /proc/pid/oom_adj is provided for legacy purposes, ask users to use * /proc/pid/oom_score_adj instead. */ pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", current->comm, task_pid_nr(current), task_pid_nr(task), task_pid_nr(task)); } else { if ((short)oom_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_unlock; } } /* * Make sure we will check other processes sharing the mm if this is * not vfrok which wants its own oom_score_adj. * pin the mm so it doesn't go away and get reused after task_unlock */ if (!task->vfork_done) { struct task_struct *p = find_lock_task_mm(task); if (p) { if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) { mm = p->mm; mmgrab(mm); } task_unlock(p); } } task->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = (short)oom_adj; trace_oom_score_adj_update(task); if (mm) { struct task_struct *p; rcu_read_lock(); for_each_process(p) { if (same_thread_group(task, p)) continue; /* do not touch kernel threads or the global init */ if (p->flags & PF_KTHREAD || is_global_init(p)) continue; task_lock(p); if (!p->vfork_done && process_shares_mm(p, mm)) { p->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) p->signal->oom_score_adj_min = (short)oom_adj; } task_unlock(p); } rcu_read_unlock(); mmdrop(mm); } err_unlock: mutex_unlock(&oom_adj_mutex); put_task_struct(task); return err; } /* * /proc/pid/oom_adj exists solely for backwards compatibility with previous * kernels. The effective policy is defined by oom_score_adj, which has a * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. * Values written to oom_adj are simply mapped linearly to oom_score_adj. * Processes that become oom disabled via oom_adj will still be oom disabled * with this implementation. * * oom_adj cannot be removed since existing userspace binaries use it. */ static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char buffer[PROC_NUMBUF]; int oom_adj; int err; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { err = -EFAULT; goto out; } err = kstrtoint(strstrip(buffer), 0, &oom_adj); if (err) goto out; if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && oom_adj != OOM_DISABLE) { err = -EINVAL; goto out; } /* * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum * value is always attainable. */ if (oom_adj == OOM_ADJUST_MAX) oom_adj = OOM_SCORE_ADJ_MAX; else oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; err = __set_oom_adj(file, oom_adj, true); out: return err < 0 ? err : count; } static const struct file_operations proc_oom_adj_operations = { .read = oom_adj_read, .write = oom_adj_write, .llseek = generic_file_llseek, }; static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; short oom_score_adj = OOM_SCORE_ADJ_MIN; size_t len; if (!task) return -ESRCH; oom_score_adj = task->signal->oom_score_adj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char buffer[PROC_NUMBUF]; int oom_score_adj; int err; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { err = -EFAULT; goto out; } err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); if (err) goto out; if (oom_score_adj < OOM_SCORE_ADJ_MIN || oom_score_adj > OOM_SCORE_ADJ_MAX) { err = -EINVAL; goto out; } err = __set_oom_adj(file, oom_score_adj, false); out: return err < 0 ? err : count; } static const struct file_operations proc_oom_score_adj_operations = { .read = oom_score_adj_read, .write = oom_score_adj_write, .llseek = default_llseek, }; #ifdef CONFIG_AUDIT #define TMPBUFLEN 11 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", from_kuid(file->f_cred->user_ns, audit_get_loginuid(task))); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); uid_t loginuid; kuid_t kloginuid; int rv; /* Don't let kthreads write their own loginuid */ if (current->flags & PF_KTHREAD) return -EPERM; rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); return -EPERM; } rcu_read_unlock(); if (*ppos != 0) { /* No partial writes. */ return -EINVAL; } rv = kstrtou32_from_user(buf, count, 10, &loginuid); if (rv < 0) return rv; /* is userspace tring to explicitly UNSET the loginuid? */ if (loginuid == AUDIT_UID_UNSET) { kloginuid = INVALID_UID; } else { kloginuid = make_kuid(file->f_cred->user_ns, loginuid); if (!uid_valid(kloginuid)) return -EINVAL; } rv = audit_set_loginuid(kloginuid); if (rv < 0) return rv; return count; } static const struct file_operations proc_loginuid_operations = { .read = proc_loginuid_read, .write = proc_loginuid_write, .llseek = generic_file_llseek, }; static ssize_t proc_sessionid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_sessionid(task)); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static const struct file_operations proc_sessionid_operations = { .read = proc_sessionid_read, .llseek = generic_file_llseek, }; #endif #ifdef CONFIG_FAULT_INJECTION static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; size_t len; int make_it_fail; if (!task) return -ESRCH; make_it_fail = task->make_it_fail; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static ssize_t proc_fault_inject_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task; char buffer[PROC_NUMBUF]; int make_it_fail; int rv; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; rv = kstrtoint(strstrip(buffer), 0, &make_it_fail); if (rv < 0) return rv; if (make_it_fail < 0 || make_it_fail > 1) return -EINVAL; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; task->make_it_fail = make_it_fail; put_task_struct(task); return count; } static const struct file_operations proc_fault_inject_operations = { .read = proc_fault_inject_read, .write = proc_fault_inject_write, .llseek = generic_file_llseek, }; static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; int err; unsigned int n; err = kstrtouint_from_user(buf, count, 0, &n); if (err) return err; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; task->fail_nth = n; put_task_struct(task); return count; } static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; char numbuf[PROC_NUMBUF]; ssize_t len; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, numbuf, len); } static const struct file_operations proc_fail_nth_operations = { .read = proc_fail_nth_read, .write = proc_fail_nth_write, }; #endif #ifdef CONFIG_SCHED_DEBUG /* * Print out various scheduling related per-task fields: */ static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_show_task(p, ns, m); put_task_struct(p); return 0; } static ssize_t sched_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_set_task(p); put_task_struct(p); return count; } static int sched_open(struct inode *inode, struct file *filp) { return single_open(filp, sched_show, inode); } static const struct file_operations proc_pid_sched_operations = { .open = sched_open, .read = seq_read, .write = sched_write, .llseek = seq_lseek, .release = single_release, }; #endif #ifdef CONFIG_SCHED_AUTOGROUP /* * Print out autogroup related information: */ static int sched_autogroup_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_autogroup_show_task(p, m); put_task_struct(p); return 0; } static ssize_t sched_autogroup_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; char buffer[PROC_NUMBUF]; int nice; int err; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; err = kstrtoint(strstrip(buffer), 0, &nice); if (err < 0) return err; p = get_proc_task(inode); if (!p) return -ESRCH; err = proc_sched_autogroup_set_nice(p, nice); if (err) count = err; put_task_struct(p); return count; } static int sched_autogroup_open(struct inode *inode, struct file *filp) { int ret; ret = single_open(filp, sched_autogroup_show, NULL); if (!ret) { struct seq_file *m = filp->private_data; m->private = inode; } return ret; } static const struct file_operations proc_pid_sched_autogroup_operations = { .open = sched_autogroup_open, .read = seq_read, .write = sched_autogroup_write, .llseek = seq_lseek, .release = single_release, }; #endif /* CONFIG_SCHED_AUTOGROUP */ #ifdef CONFIG_TIME_NS static int timens_offsets_show(struct seq_file *m, void *v) { struct task_struct *p; p = get_proc_task(file_inode(m->file)); if (!p) return -ESRCH; proc_timens_show_offsets(p, m); put_task_struct(p); return 0; } static ssize_t timens_offsets_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct inode *inode = file_inode(file); struct proc_timens_offset offsets[2]; char *kbuf = NULL, *pos, *next_line; struct task_struct *p; int ret, noffsets; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* Parse the user data */ ret = -EINVAL; noffsets = 0; for (pos = kbuf; pos; pos = next_line) { struct proc_timens_offset *off = &offsets[noffsets]; char clock[10]; int err; /* Find the end of line and ensure we don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } err = sscanf(pos, "%9s %lld %lu", clock, &off->val.tv_sec, &off->val.tv_nsec); if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC) goto out; clock[sizeof(clock) - 1] = 0; if (strcmp(clock, "monotonic") == 0 || strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0) off->clockid = CLOCK_MONOTONIC; else if (strcmp(clock, "boottime") == 0 || strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0) off->clockid = CLOCK_BOOTTIME; else goto out; noffsets++; if (noffsets == ARRAY_SIZE(offsets)) { if (next_line) count = next_line - kbuf; break; } } ret = -ESRCH; p = get_proc_task(inode); if (!p) goto out; ret = proc_timens_set_offset(file, p, offsets, noffsets); put_task_struct(p); if (ret) goto out; ret = count; out: kfree(kbuf); return ret; } static int timens_offsets_open(struct inode *inode, struct file *filp) { return single_open(filp, timens_offsets_show, inode); } static const struct file_operations proc_timens_offsets_operations = { .open = timens_offsets_open, .read = seq_read, .write = timens_offsets_write, .llseek = seq_lseek, .release = single_release, }; #endif /* CONFIG_TIME_NS */ static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; char buffer[TASK_COMM_LEN]; const size_t maxlen = sizeof(buffer) - 1; memset(buffer, 0, sizeof(buffer)); if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) return -EFAULT; p = get_proc_task(inode); if (!p) return -ESRCH; if (same_thread_group(current, p)) { set_task_comm(p, buffer); proc_comm_connector(p); } else count = -EINVAL; put_task_struct(p); return count; } static int comm_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_task_name(m, p, false); seq_putc(m, '\n'); put_task_struct(p); return 0; } static int comm_open(struct inode *inode, struct file *filp) { return single_open(filp, comm_show, inode); } static const struct file_operations proc_pid_set_comm_operations = { .open = comm_open, .read = seq_read, .write = comm_write, .llseek = seq_lseek, .release = single_release, }; static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; struct file *exe_file; task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; exe_file = get_task_exe_file(task); put_task_struct(task); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); fput(exe_file); return 0; } else return -ENOENT; } static const char *proc_pid_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct path path; int error = -EACCES; if (!dentry) return ERR_PTR(-ECHILD); /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; error = nd_jump_link(&path); out: return ERR_PTR(error); } static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen) { char *tmp = kmalloc(PATH_MAX, GFP_KERNEL); char *pathname; int len; if (!tmp) return -ENOMEM; pathname = d_path(path, tmp, PATH_MAX); len = PTR_ERR(pathname); if (IS_ERR(pathname)) goto out; len = tmp + PATH_MAX - 1 - pathname; if (len > buflen) len = buflen; if (copy_to_user(buffer, pathname, len)) len = -EFAULT; out: kfree(tmp); return len; } static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) { int error = -EACCES; struct inode *inode = d_inode(dentry); struct path path; /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; error = do_proc_readlink(&path, buffer, buflen); path_put(&path); out: return error; } const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .get_link = proc_pid_get_link, .setattr = proc_setattr, }; /* building an inode */ void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid) { /* Depending on the state of dumpable compute who should own a * proc file for a task. */ const struct cred *cred; kuid_t uid; kgid_t gid; if (unlikely(task->flags & PF_KTHREAD)) { *ruid = GLOBAL_ROOT_UID; *rgid = GLOBAL_ROOT_GID; return; } /* Default to the tasks effective ownership */ rcu_read_lock(); cred = __task_cred(task); uid = cred->euid; gid = cred->egid; rcu_read_unlock(); /* * Before the /proc/pid/status file was created the only way to read * the effective uid of a /process was to stat /proc/pid. Reading * /proc/pid/status is slow enough that procps and other packages * kept stating /proc/pid. To keep the rules in /proc simple I have * made this apply to all per process world readable and executable * directories. */ if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) { struct mm_struct *mm; task_lock(task); mm = task->mm; /* Make non-dumpable tasks owned by some root */ if (mm) { if (get_dumpable(mm) != SUID_DUMP_USER) { struct user_namespace *user_ns = mm->user_ns; uid = make_kuid(user_ns, 0); if (!uid_valid(uid)) uid = GLOBAL_ROOT_UID; gid = make_kgid(user_ns, 0); if (!gid_valid(gid)) gid = GLOBAL_ROOT_GID; } } else { uid = GLOBAL_ROOT_UID; gid = GLOBAL_ROOT_GID; } task_unlock(task); } *ruid = uid; *rgid = gid; } void proc_pid_evict_inode(struct proc_inode *ei) { struct pid *pid = ei->pid; if (S_ISDIR(ei->vfs_inode.i_mode)) { spin_lock(&pid->lock); hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(&pid->lock); } put_pid(pid); } struct inode *proc_pid_make_inode(struct super_block *sb, struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; struct pid *pid; /* We need a new inode */ inode = new_inode(sb); if (!inode) goto out; /* Common stuff */ ei = PROC_I(inode); inode->i_mode = mode; inode->i_ino = get_next_ino(); inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_op = &proc_def_inode_operations; /* * grab the reference to task. */ pid = get_task_pid(task, PIDTYPE_PID); if (!pid) goto out_unlock; /* Let the pid remember us for quick removal */ ei->pid = pid; task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); out: return inode; out_unlock: iput(inode); return NULL; } /* * Generating an inode and adding it into @pid->inodes, so that task will * invalidate inode's dentry before being released. * * This helper is used for creating dir-type entries under '/proc' and * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>' * can be released by invalidating '/proc/<tgid>' dentry. * In theory, dentries under '/proc/<tgid>/task' can also be released by * invalidating '/proc/<tgid>' dentry, we reserve it to handle single * thread exiting situation: Any one of threads should invalidate its * '/proc/<tgid>/task/<pid>' dentry before released. */ static struct inode *proc_pid_make_base_inode(struct super_block *sb, struct task_struct *task, umode_t mode) { struct inode *inode; struct proc_inode *ei; struct pid *pid; inode = proc_pid_make_inode(sb, task, mode); if (!inode) return NULL; /* Let proc_flush_pid find this directory inode */ ei = PROC_I(inode); pid = ei->pid; spin_lock(&pid->lock); hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); spin_unlock(&pid->lock); return inode; } int pid_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) { rcu_read_unlock(); /* * This doesn't prevent learning whether PID exists, * it only makes getattr() consistent with readdir(). */ return -ENOENT; } task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid); } rcu_read_unlock(); return 0; } /* dentry stuff */ /* * Set <pid>/... inode ownership (can change due to setuid(), etc.) */ void pid_update_inode(struct task_struct *task, struct inode *inode) { task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); } /* * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. * */ static int pid_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; int ret = 0; rcu_read_lock(); inode = d_inode_rcu(dentry); if (!inode) goto out; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { pid_update_inode(task, inode); ret = 1; } out: rcu_read_unlock(); return ret; } static inline bool proc_inode_is_dead(struct inode *inode) { return !proc_pid(inode)->tasks[PIDTYPE_PID].first; } int pid_delete_dentry(const struct dentry *dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. */ return proc_inode_is_dead(d_inode(dentry)); } const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, .d_delete = pid_delete_dentry, }; /* Lookups */ /* * Fill a directory entry. * * If possible create the dcache entry and derive our inode number and * file type from dcache entry. * * Since all of the proc inode numbers are dynamically generated, the inode * numbers do not exist until the inode is cache. This means creating * the dcache entry in readdir is necessary to keep the inode numbers * reported by readdir in sync with the inode numbers reported * by stat. */ bool proc_fill_cache(struct file *file, struct dir_context *ctx, const char *name, unsigned int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = file->f_path.dentry; struct qstr qname = QSTR_INIT(name, len); struct inode *inode; unsigned type = DT_UNKNOWN; ino_t ino = 1; child = d_hash_and_lookup(dir, &qname); if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) goto end_instantiate; if (d_in_lookup(child)) { struct dentry *res; res = instantiate(child, task, ptr); d_lookup_done(child); if (unlikely(res)) { dput(child); child = res; if (IS_ERR(child)) goto end_instantiate; } } } inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); end_instantiate: return dir_emit(ctx, name, len, ino, type); } /* * dname_to_vma_addr - maps a dentry name into two unsigned longs * which represent vma start and end addresses. */ static int dname_to_vma_addr(struct dentry *dentry, unsigned long *start, unsigned long *end) { const char *str = dentry->d_name.name; unsigned long long sval, eval; unsigned int len; if (str[0] == '0' && str[1] != '-') return -EINVAL; len = _parse_integer(str, 16, &sval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; if (sval != (unsigned long)sval) return -EINVAL; str += len; if (*str != '-') return -EINVAL; str++; if (str[0] == '0' && str[1]) return -EINVAL; len = _parse_integer(str, 16, &eval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; if (eval != (unsigned long)eval) return -EINVAL; str += len; if (*str != '\0') return -EINVAL; *start = sval; *end = eval; return 0; } static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; bool exact_vma_exists = false; struct mm_struct *mm = NULL; struct task_struct *task; struct inode *inode; int status = 0; if (flags & LOOKUP_RCU) return -ECHILD; inode = d_inode(dentry); task = get_proc_task(inode); if (!task) goto out_notask; mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) goto out; if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { status = mmap_read_lock_killable(mm); if (!status) { exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); mmap_read_unlock(mm); } } mmput(mm); if (exact_vma_exists) { task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); status = 1; } out: put_task_struct(task); out_notask: return status; } static const struct dentry_operations tid_map_files_dentry_operations = { .d_revalidate = map_files_d_revalidate, .d_delete = pid_delete_dentry, }; static int map_files_get_link(struct dentry *dentry, struct path *path) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; struct mm_struct *mm; int rc; rc = -ENOENT; task = get_proc_task(d_inode(dentry)); if (!task) goto out; mm = get_task_mm(task); put_task_struct(task); if (!mm) goto out; rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); if (rc) goto out_mmput; rc = mmap_read_lock_killable(mm); if (rc) goto out_mmput; rc = -ENOENT; vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { *path = vma->vm_file->f_path; path_get(path); rc = 0; } mmap_read_unlock(mm); out_mmput: mmput(mm); out: return rc; } struct map_files_info { unsigned long start; unsigned long end; fmode_t mode; }; /* * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due * to concerns about how the symlinks may be used to bypass permissions on * ancestor directories in the path to the file in question. */ static const char * proc_map_files_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { if (!checkpoint_restore_ns_capable(&init_user_ns)) return ERR_PTR(-EPERM); return proc_pid_get_link(dentry, inode, done); } /* * Identical to proc_pid_link_inode_operations except for get_link() */ static const struct inode_operations proc_map_files_link_inode_operations = { .readlink = proc_pid_readlink, .get_link = proc_map_files_get_link, .setattr = proc_setattr, }; static struct dentry * proc_map_files_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { fmode_t mode = (fmode_t)(unsigned long)ptr; struct proc_inode *ei; struct inode *inode; inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | ((mode & FMODE_READ ) ? S_IRUSR : 0) | ((mode & FMODE_WRITE) ? S_IWUSR : 0)); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); ei->op.proc_get_link = map_files_get_link; inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; d_set_d_op(dentry, &tid_map_files_dentry_operations); return d_splice_alias(inode, dentry); } static struct dentry *proc_map_files_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; struct dentry *result; struct mm_struct *mm; result = ERR_PTR(-ENOENT); task = get_proc_task(dir); if (!task) goto out; result = ERR_PTR(-EACCES); if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; result = ERR_PTR(-ENOENT); if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) goto out_put_task; mm = get_task_mm(task); if (!mm) goto out_put_task; result = ERR_PTR(-EINTR); if (mmap_read_lock_killable(mm)) goto out_put_mm; result = ERR_PTR(-ENOENT); vma = find_exact_vma(mm, vm_start, vm_end); if (!vma) goto out_no_vma; if (vma->vm_file) result = proc_map_files_instantiate(dentry, task, (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: mmap_read_unlock(mm); out_put_mm: mmput(mm); out_put_task: put_task_struct(task); out: return result; } static const struct inode_operations proc_map_files_inode_operations = { .lookup = proc_map_files_lookup, .permission = proc_fd_permission, .setattr = proc_setattr, }; static int proc_map_files_readdir(struct file *file, struct dir_context *ctx) { struct vm_area_struct *vma; struct task_struct *task; struct mm_struct *mm; unsigned long nr_files, pos, i; GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; struct vma_iterator vmi; genradix_init(&fa); ret = -ENOENT; task = get_proc_task(file_inode(file)); if (!task) goto out; ret = -EACCES; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; ret = 0; if (!dir_emit_dots(file, ctx)) goto out_put_task; mm = get_task_mm(task); if (!mm) goto out_put_task; ret = mmap_read_lock_killable(mm); if (ret) { mmput(mm); goto out_put_task; } nr_files = 0; /* * We need two passes here: * * 1) Collect vmas of mapped files with mmap_lock taken * 2) Release mmap_lock and instantiate entries * * otherwise we get lockdep complained, since filldir() * routine might require mmap_lock taken in might_fault(). */ pos = 2; vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (++pos <= ctx->pos) continue; p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL); if (!p) { ret = -ENOMEM; mmap_read_unlock(mm); mmput(mm); goto out_put_task; } p->start = vma->vm_start; p->end = vma->vm_end; p->mode = vma->vm_file->f_mode; } mmap_read_unlock(mm); mmput(mm); for (i = 0; i < nr_files; i++) { char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ unsigned int len; p = genradix_ptr(&fa, i); len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, buf, len, proc_map_files_instantiate, task, (void *)(unsigned long)p->mode)) break; ctx->pos++; } out_put_task: put_task_struct(task); out: genradix_free(&fa); return ret; } static const struct file_operations proc_map_files_operations = { .read = generic_read_dir, .iterate_shared = proc_map_files_readdir, .llseek = generic_file_llseek, }; #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { struct pid *pid; struct task_struct *task; struct sighand_struct *sighand; struct pid_namespace *ns; unsigned long flags; }; static void *timers_start(struct seq_file *m, loff_t *pos) { struct timers_private *tp = m->private; tp->task = get_pid_task(tp->pid, PIDTYPE_PID); if (!tp->task) return ERR_PTR(-ESRCH); tp->sighand = lock_task_sighand(tp->task, &tp->flags); if (!tp->sighand) return ERR_PTR(-ESRCH); return seq_list_start(&tp->task->signal->posix_timers, *pos); } static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; return seq_list_next(v, &tp->task->signal->posix_timers, pos); } static void timers_stop(struct seq_file *m, void *v) { struct timers_private *tp = m->private; if (tp->sighand) { unlock_task_sighand(tp->task, &tp->flags); tp->sighand = NULL; } if (tp->task) { put_task_struct(tp->task); tp->task = NULL; } } static int show_timer(struct seq_file *m, void *v) { struct k_itimer *timer; struct timers_private *tp = m->private; int notify; static const char * const nstr[] = { [SIGEV_SIGNAL] = "signal", [SIGEV_NONE] = "none", [SIGEV_THREAD] = "thread", }; timer = list_entry((struct list_head *)v, struct k_itimer, list); notify = timer->it_sigev_notify; seq_printf(m, "ID: %d\n", timer->it_id); seq_printf(m, "signal: %d/%px\n", timer->sigq->info.si_signo, timer->sigq->info.si_value.sival_ptr); seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? "tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); seq_printf(m, "ClockID: %d\n", timer->it_clock); return 0; } static const struct seq_operations proc_timers_seq_ops = { .start = timers_start, .next = timers_next, .stop = timers_stop, .show = show_timer, }; static int proc_timers_open(struct inode *inode, struct file *file) { struct timers_private *tp; tp = __seq_open_private(file, &proc_timers_seq_ops, sizeof(struct timers_private)); if (!tp) return -ENOMEM; tp->pid = proc_pid(inode); tp->ns = proc_pid_ns(inode->i_sb); return 0; } static const struct file_operations proc_timers_operations = { .open = proc_timers_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; #endif static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; u64 slack_ns; int err; err = kstrtoull_from_user(buf, count, 10, &slack_ns); if (err < 0) return err; p = get_proc_task(inode); if (!p) return -ESRCH; if (p != current) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); count = -EPERM; goto out; } rcu_read_unlock(); err = security_task_setscheduler(p); if (err) { count = err; goto out; } } task_lock(p); if (slack_ns == 0) p->timer_slack_ns = p->default_timer_slack_ns; else p->timer_slack_ns = slack_ns; task_unlock(p); out: put_task_struct(p); return count; } static int timerslack_ns_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; int err = 0; p = get_proc_task(inode); if (!p) return -ESRCH; if (p != current) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); err = -EPERM; goto out; } rcu_read_unlock(); err = security_task_getscheduler(p); if (err) goto out; } task_lock(p); seq_printf(m, "%llu\n", p->timer_slack_ns); task_unlock(p); out: put_task_struct(p); return err; } static int timerslack_ns_open(struct inode *inode, struct file *filp) { return single_open(filp, timerslack_ns_show, inode); } static const struct file_operations proc_pid_set_timerslack_ns_operations = { .open = timerslack_ns_open, .read = seq_read, .write = timerslack_ns_write, .llseek = seq_lseek, .release = single_release, }; static struct dentry *proc_pident_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; inode = proc_pid_make_inode(dentry->d_sb, task, p->mode); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); if (S_ISDIR(inode->i_mode)) set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop) inode->i_op = p->iop; if (p->fop) inode->i_fop = p->fop; ei->op = p->op; pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); return d_splice_alias(inode, dentry); } static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, const struct pid_entry *p, const struct pid_entry *end) { struct task_struct *task = get_proc_task(dir); struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; /* * Yes, it does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons. */ for (; p < end; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) { res = proc_pident_instantiate(dentry, task, p); break; } } put_task_struct(task); out_no_task: return res; } static int proc_pident_readdir(struct file *file, struct dir_context *ctx, const struct pid_entry *ents, unsigned int nents) { struct task_struct *task = get_proc_task(file_inode(file)); const struct pid_entry *p; if (!task) return -ENOENT; if (!dir_emit_dots(file, ctx)) goto out; if (ctx->pos >= nents + 2) goto out; for (p = ents + (ctx->pos - 2); p < ents + nents; p++) { if (!proc_fill_cache(file, ctx, p->name, p->len, proc_pident_instantiate, task, p)) break; ctx->pos++; } out: put_task_struct(task); return 0; } #ifdef CONFIG_SECURITY static int proc_pid_attr_open(struct inode *inode, struct file *file) { file->private_data = NULL; __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); return 0; } static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); char *p = NULL; ssize_t length; struct task_struct *task = get_proc_task(inode); if (!task) return -ESRCH; length = security_getprocattr(task, PROC_I(inode)->op.lsm, file->f_path.dentry->d_name.name, &p); put_task_struct(task); if (length > 0) length = simple_read_from_buffer(buf, count, ppos, p, length); kfree(p); return length; } static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task; void *page; int rv; /* A task may only write when it was the opener. */ if (file->private_data != current->mm) return -EPERM; rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (!task) { rcu_read_unlock(); return -ESRCH; } /* A task may only write its own attributes. */ if (current != task) { rcu_read_unlock(); return -EACCES; } /* Prevent changes to overridden credentials. */ if (current_cred() != current_real_cred()) { rcu_read_unlock(); return -EBUSY; } rcu_read_unlock(); if (count > PAGE_SIZE) count = PAGE_SIZE; /* No partial writes. */ if (*ppos != 0) return -EINVAL; page = memdup_user(buf, count); if (IS_ERR(page)) { rv = PTR_ERR(page); goto out; } /* Guard against adverse ptrace interaction */ rv = mutex_lock_interruptible(¤t->signal->cred_guard_mutex); if (rv < 0) goto out_free; rv = security_setprocattr(PROC_I(inode)->op.lsm, file->f_path.dentry->d_name.name, page, count); mutex_unlock(¤t->signal->cred_guard_mutex); out_free: kfree(page); out: return rv; } static const struct file_operations proc_pid_attr_operations = { .open = proc_pid_attr_open, .read = proc_pid_attr_read, .write = proc_pid_attr_write, .llseek = generic_file_llseek, .release = mem_release, }; #define LSM_DIR_OPS(LSM) \ static int proc_##LSM##_attr_dir_iterate(struct file *filp, \ struct dir_context *ctx) \ { \ return proc_pident_readdir(filp, ctx, \ LSM##_attr_dir_stuff, \ ARRAY_SIZE(LSM##_attr_dir_stuff)); \ } \ \ static const struct file_operations proc_##LSM##_attr_dir_ops = { \ .read = generic_read_dir, \ .iterate_shared = proc_##LSM##_attr_dir_iterate, \ .llseek = default_llseek, \ }; \ \ static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \ struct dentry *dentry, unsigned int flags) \ { \ return proc_pident_lookup(dir, dentry, \ LSM##_attr_dir_stuff, \ LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ } \ \ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ .lookup = proc_##LSM##_attr_dir_lookup, \ .getattr = pid_getattr, \ .setattr = proc_setattr, \ } #ifdef CONFIG_SECURITY_SMACK static const struct pid_entry smack_attr_dir_stuff[] = { ATTR("smack", "current", 0666), }; LSM_DIR_OPS(smack); #endif #ifdef CONFIG_SECURITY_APPARMOR static const struct pid_entry apparmor_attr_dir_stuff[] = { ATTR("apparmor", "current", 0666), ATTR("apparmor", "prev", 0444), ATTR("apparmor", "exec", 0666), }; LSM_DIR_OPS(apparmor); #endif static const struct pid_entry attr_dir_stuff[] = { ATTR(NULL, "current", 0666), ATTR(NULL, "prev", 0444), ATTR(NULL, "exec", 0666), ATTR(NULL, "fscreate", 0666), ATTR(NULL, "keycreate", 0666), ATTR(NULL, "sockcreate", 0666), #ifdef CONFIG_SECURITY_SMACK DIR("smack", 0555, proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops), #endif #ifdef CONFIG_SECURITY_APPARMOR DIR("apparmor", 0555, proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops), #endif }; static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); } static const struct file_operations proc_attr_dir_operations = { .read = generic_read_dir, .iterate_shared = proc_attr_dir_readdir, .llseek = generic_file_llseek, }; static struct dentry *proc_attr_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, attr_dir_stuff, attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff)); } static const struct inode_operations proc_attr_dir_inode_operations = { .lookup = proc_attr_dir_lookup, .getattr = pid_getattr, .setattr = proc_setattr, }; #endif #ifdef CONFIG_ELF_CORE static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; char buffer[PROC_NUMBUF]; size_t len; int ret; if (!task) return -ESRCH; ret = 0; mm = get_task_mm(task); if (mm) { len = snprintf(buffer, sizeof(buffer), "%08lx\n", ((mm->flags & MMF_DUMP_FILTER_MASK) >> MMF_DUMP_FILTER_SHIFT)); mmput(mm); ret = simple_read_from_buffer(buf, count, ppos, buffer, len); } put_task_struct(task); return ret; } static ssize_t proc_coredump_filter_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; struct mm_struct *mm; unsigned int val; int ret; int i; unsigned long mask; ret = kstrtouint_from_user(buf, count, 0, &val); if (ret < 0) return ret; ret = -ESRCH; task = get_proc_task(file_inode(file)); if (!task) goto out_no_task; mm = get_task_mm(task); if (!mm) goto out_no_mm; ret = 0; for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { if (val & mask) set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); else clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); } mmput(mm); out_no_mm: put_task_struct(task); out_no_task: if (ret < 0) return ret; return count; } static const struct file_operations proc_coredump_filter_operations = { .read = proc_coredump_filter_read, .write = proc_coredump_filter_write, .llseek = generic_file_llseek, }; #endif #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { struct task_io_accounting acct = task->ioac; unsigned long flags; int result; result = down_read_killable(&task->signal->exec_update_lock); if (result) return result; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { result = -EACCES; goto out_unlock; } if (whole && lock_task_sighand(task, &flags)) { struct task_struct *t = task; task_io_accounting_add(&acct, &task->signal->ioac); while_each_thread(task, t) task_io_accounting_add(&acct, &t->ioac); unlock_task_sighand(task, &flags); } seq_printf(m, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" "syscw: %llu\n" "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", (unsigned long long)acct.rchar, (unsigned long long)acct.wchar, (unsigned long long)acct.syscr, (unsigned long long)acct.syscw, (unsigned long long)acct.read_bytes, (unsigned long long)acct.write_bytes, (unsigned long long)acct.cancelled_write_bytes); result = 0; out_unlock: up_read(&task->signal->exec_update_lock); return result; } static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { return do_io_accounting(task, m, 0); } static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { return do_io_accounting(task, m, 1); } #endif /* CONFIG_TASK_IO_ACCOUNTING */ #ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; struct seq_file *seq; int ret = -EINVAL; task = get_proc_task(inode); if (task) { rcu_read_lock(); ns = get_user_ns(task_cred_xxx(task, user_ns)); rcu_read_unlock(); put_task_struct(task); } if (!ns) goto err; ret = seq_open(file, seq_ops); if (ret) goto err_put_ns; seq = file->private_data; seq->private = ns; return 0; err_put_ns: put_user_ns(ns); err: return ret; } static int proc_id_map_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; put_user_ns(ns); return seq_release(inode, file); } static int proc_uid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, &proc_uid_seq_operations); } static int proc_gid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, &proc_gid_seq_operations); } static int proc_projid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, &proc_projid_seq_operations); } static const struct file_operations proc_uid_map_operations = { .open = proc_uid_map_open, .write = proc_uid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static const struct file_operations proc_gid_map_operations = { .open = proc_gid_map_open, .write = proc_gid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static const struct file_operations proc_projid_map_operations = { .open = proc_projid_map_open, .write = proc_projid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static int proc_setgroups_open(struct inode *inode, struct file *file) { struct user_namespace *ns = NULL; struct task_struct *task; int ret; ret = -ESRCH; task = get_proc_task(inode); if (task) { rcu_read_lock(); ns = get_user_ns(task_cred_xxx(task, user_ns)); rcu_read_unlock(); put_task_struct(task); } if (!ns) goto err; if (file->f_mode & FMODE_WRITE) { ret = -EACCES; if (!ns_capable(ns, CAP_SYS_ADMIN)) goto err_put_ns; } ret = single_open(file, &proc_setgroups_show, ns); if (ret) goto err_put_ns; return 0; err_put_ns: put_user_ns(ns); err: return ret; } static int proc_setgroups_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; int ret = single_release(inode, file); put_user_ns(ns); return ret; } static const struct file_operations proc_setgroups_operations = { .open = proc_setgroups_open, .write = proc_setgroups_write, .read = seq_read, .llseek = seq_lseek, .release = proc_setgroups_release, }; #endif /* CONFIG_USER_NS */ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { int err = lock_trace(task); if (!err) { seq_printf(m, "%08x\n", task->personality); unlock_trace(task); } return err; } #ifdef CONFIG_LIVEPATCH static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { seq_printf(m, "%d\n", task->patch_state); return 0; } #endif /* CONFIG_LIVEPATCH */ #ifdef CONFIG_KSM static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; mm = get_task_mm(task); if (mm) { seq_printf(m, "%lu\n", mm->ksm_merging_pages); mmput(mm); } return 0; } static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); } return 0; } #endif /* CONFIG_KSM */ #ifdef CONFIG_STACKLEAK_METRICS static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long prev_depth = THREAD_SIZE - (task->prev_lowest_stack & (THREAD_SIZE - 1)); unsigned long depth = THREAD_SIZE - (task->lowest_stack & (THREAD_SIZE - 1)); seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n", prev_depth, depth); return 0; } #endif /* CONFIG_STACKLEAK_METRICS */ /* * Thread groups */ static const struct file_operations proc_task_operations; static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), REG("auxv", S_IRUSR, proc_auxv_operations), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif #ifdef CONFIG_TIME_NS REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), LNK("root", proc_root_link), LNK("exe", proc_exe_link), REG("mounts", S_IRUGO, proc_mounts_operations), REG("mountinfo", S_IRUGO, proc_mountinfo_operations), REG("mountstats", S_IRUSR, proc_mountstats_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif #ifdef CONFIG_PROC_CPU_RESCTRL ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_ELF_CORE REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tgid_io_accounting), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) REG("timers", S_IRUGO, proc_timers_operations), #endif REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif #ifdef CONFIG_STACKLEAK_METRICS ONE("stack_depth", S_IRUGO, proc_stack_depth), #endif #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, .iterate_shared = proc_tgid_base_readdir, .llseek = generic_file_llseek, }; struct pid *tgid_pidfd_to_pid(const struct file *file) { if (file->f_op != &proc_tgid_base_operations) return ERR_PTR(-EBADF); return proc_pid(file_inode(file)); } static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, tgid_base_stuff, tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff)); } static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, .permission = proc_pid_permission, }; /** * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache. * @pid: pid that should be flushed. * * This function walks a list of inodes (that belong to any proc * filesystem) that are attached to the pid and flushes them from * the dentry cache. * * It is safe and reasonable to cache /proc entries for a task until * that task exits. After that they just clog up the dcache with * useless entries, possibly causing useful dcache entries to be * flushed instead. This routine is provided to flush those useless * dcache entries when a process is reaped. * * NOTE: This routine is just an optimization so it does not guarantee * that no dcache entries will exist after a process is reaped * it just makes it very unlikely that any will persist. */ void proc_flush_pid(struct pid *pid) { proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock); } static struct dentry *proc_pid_instantiate(struct dentry * dentry, struct task_struct *task, const void *ptr) { struct inode *inode; inode = proc_pid_make_base_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; set_nlink(inode, nlink_tgid); pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); return d_splice_alias(inode, dentry); } struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) { struct task_struct *task; unsigned tgid; struct proc_fs_info *fs_info; struct pid_namespace *ns; struct dentry *result = ERR_PTR(-ENOENT); tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) goto out; fs_info = proc_sb_info(dentry->d_sb); ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tgid, ns); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) goto out; /* Limit procfs to only ptraceable tasks */ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) { if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS)) goto out_put_task; } result = proc_pid_instantiate(dentry, task, NULL); out_put_task: put_task_struct(task); out: return result; } /* * Find the first task with tgid >= tgid * */ struct tgid_iter { unsigned int tgid; struct task_struct *task; }; static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter) { struct pid *pid; if (iter.task) put_task_struct(iter.task); rcu_read_lock(); retry: iter.task = NULL; pid = find_ge_pid(iter.tgid, ns); if (pid) { iter.tgid = pid_nr_ns(pid, ns); iter.task = pid_task(pid, PIDTYPE_TGID); if (!iter.task) { iter.tgid += 1; goto retry; } get_task_struct(iter.task); } rcu_read_unlock(); return iter; } #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2) /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb); struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb); loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) return 0; if (pos == TGID_OFFSET - 2) { struct inode *inode = d_inode(fs_info->proc_self); if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { struct inode *inode = d_inode(fs_info->proc_thread_self); if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } iter.tgid = pos - TGID_OFFSET; iter.task = NULL; for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[10 + 1]; unsigned int len; cond_resched(); if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE)) continue; len = snprintf(name, sizeof(name), "%u", iter.tgid); ctx->pos = iter.tgid + TGID_OFFSET; if (!proc_fill_cache(file, ctx, name, len, proc_pid_instantiate, iter.task, NULL)) { put_task_struct(iter.task); return 0; } } ctx->pos = PID_MAX_LIMIT + TGID_OFFSET; return 0; } /* * proc_tid_comm_permission is a special permission function exclusively * used for the node /proc/<pid>/task/<tid>/comm. * It bypasses generic permission checks in the case where a task of the same * task group attempts to access the node. * The rationale behind this is that glibc and bionic access this node for * cross thread naming (pthread_set/getname_np(!self)). However, if * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0, * which locks out the cross thread naming implementation. * This function makes sure that the node is always accessible for members of * same thread group. */ static int proc_tid_comm_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { bool is_same_tgroup; struct task_struct *task; task = get_proc_task(inode); if (!task) return -ESRCH; is_same_tgroup = same_thread_group(current, task); put_task_struct(task); if (likely(is_same_tgroup && !(mask & MAY_EXEC))) { /* This file (/proc/<pid>/task/<tid>/comm) can always be * read or written by the members of the corresponding * thread group. */ return 0; } return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_tid_comm_inode_operations = { .setattr = proc_setattr, .permission = proc_tid_comm_permission, }; /* * Tasks */ static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), REG("auxv", S_IRUSR, proc_auxv_operations), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, &proc_tid_comm_inode_operations, &proc_pid_set_comm_operations, {}), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_PROC_CHILDREN REG("children", S_IRUGO, proc_tid_children_operations), #endif #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), LNK("root", proc_root_link), LNK("exe", proc_exe_link), REG("mounts", S_IRUGO, proc_mounts_operations), REG("mountinfo", S_IRUGO, proc_mountinfo_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif #ifdef CONFIG_PROC_CPU_RESCTRL ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); } static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, tid_base_stuff, tid_base_stuff + ARRAY_SIZE(tid_base_stuff)); } static const struct file_operations proc_tid_base_operations = { .read = generic_read_dir, .iterate_shared = proc_tid_base_readdir, .llseek = generic_file_llseek, }; static const struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, }; static struct dentry *proc_task_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { struct inode *inode; inode = proc_pid_make_base_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); inode->i_op = &proc_tid_base_inode_operations; inode->i_fop = &proc_tid_base_operations; inode->i_flags |= S_IMMUTABLE; set_nlink(inode, nlink_tid); pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); return d_splice_alias(inode, dentry); } static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct task_struct *task; struct task_struct *leader = get_proc_task(dir); unsigned tid; struct proc_fs_info *fs_info; struct pid_namespace *ns; struct dentry *result = ERR_PTR(-ENOENT); if (!leader) goto out_no_task; tid = name_to_int(&dentry->d_name); if (tid == ~0U) goto out; fs_info = proc_sb_info(dentry->d_sb); ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tid, ns); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) goto out; if (!same_thread_group(leader, task)) goto out_drop_task; result = proc_task_instantiate(dentry, task, NULL); out_drop_task: put_task_struct(task); out: put_task_struct(leader); out_no_task: return result; } /* * Find the first tid of a thread group to return to user space. * * Usually this is just the thread group leader, but if the users * buffer was too small or there was a seek into the middle of the * directory we have more work todo. * * In the case of a short read we start with find_task_by_pid. * * In the case of a seek we start with the leader and walk nr * threads past it. */ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, struct pid_namespace *ns) { struct task_struct *pos, *task; unsigned long nr = f_pos; if (nr != f_pos) /* 32bit overflow? */ return NULL; rcu_read_lock(); task = pid_task(pid, PIDTYPE_PID); if (!task) goto fail; /* Attempt to start with the tid of a thread */ if (tid && nr) { pos = find_task_by_pid_ns(tid, ns); if (pos && same_thread_group(pos, task)) goto found; } /* If nr exceeds the number of threads there is nothing todo */ if (nr >= get_nr_threads(task)) goto fail; /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. */ for_each_thread(task, pos) { if (!nr--) goto found; }; fail: pos = NULL; goto out; found: get_task_struct(pos); out: rcu_read_unlock(); return pos; } /* * Find the next thread in the thread list. * Return NULL if there is an error or no next thread. * * The reference to the input task_struct is released. */ static struct task_struct *next_tid(struct task_struct *start) { struct task_struct *pos = NULL; rcu_read_lock(); if (pid_alive(start)) { pos = next_thread(start); if (thread_group_leader(pos)) pos = NULL; else get_task_struct(pos); } rcu_read_unlock(); put_task_struct(start); return pos; } /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct task_struct *task; struct pid_namespace *ns; int tid; if (proc_inode_is_dead(inode)) return -ENOENT; if (!dir_emit_dots(file, ctx)) return 0; /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ ns = proc_pid_ns(inode->i_sb); tid = (int)file->f_version; file->f_version = 0; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { char name[10 + 1]; unsigned int len; tid = task_pid_nr_ns(task, ns); if (!tid) continue; /* The task has just exited. */ len = snprintf(name, sizeof(name), "%u", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first * pid for the next readir call */ file->f_version = (u64)tid; put_task_struct(task); break; } } return 0; } static int proc_task_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (p) { stat->nlink += get_nr_threads(p); put_task_struct(p); } return 0; } static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, .setattr = proc_setattr, .permission = proc_pid_permission, }; static const struct file_operations proc_task_operations = { .read = generic_read_dir, .iterate_shared = proc_task_readdir, .llseek = generic_file_llseek, }; void __init set_proc_pid_nlink(void) { nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } |
721 7008 1794 5968 594 38 7743 1846 1300 8 2608 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Berkeley style UIO structures - Alan Cox 1994. */ #ifndef __LINUX_UIO_H #define __LINUX_UIO_H #include <linux/kernel.h> #include <linux/thread_info.h> #include <linux/mm_types.h> #include <uapi/linux/uio.h> struct page; typedef unsigned int __bitwise iov_iter_extraction_t; struct kvec { void *iov_base; /* and that should *never* hold a userland pointer */ size_t iov_len; }; enum iter_type { /* iter types */ ITER_IOVEC, ITER_KVEC, ITER_BVEC, ITER_XARRAY, ITER_DISCARD, ITER_UBUF, }; #define ITER_SOURCE 1 // == WRITE #define ITER_DEST 0 // == READ struct iov_iter_state { size_t iov_offset; size_t count; unsigned long nr_segs; }; struct iov_iter { u8 iter_type; bool copy_mc; bool nofault; bool data_source; bool user_backed; union { size_t iov_offset; int last_offset; }; /* * Hack alert: overlay ubuf_iovec with iovec + count, so * that the members resolve correctly regardless of the type * of iterator used. This means that you can use: * * &iter->__ubuf_iovec or iter->__iov * * interchangably for the user_backed cases, hence simplifying * some of the cases that need to deal with both. */ union { /* * This really should be a const, but we cannot do that without * also modifying any of the zero-filling iter init functions. * Leave it non-const for now, but it should be treated as such. */ struct iovec __ubuf_iovec; struct { union { /* use iter_iov() to get the current vec */ const struct iovec *__iov; const struct kvec *kvec; const struct bio_vec *bvec; struct xarray *xarray; void __user *ubuf; }; size_t count; }; }; union { unsigned long nr_segs; loff_t xarray_start; }; }; static inline const struct iovec *iter_iov(const struct iov_iter *iter) { if (iter->iter_type == ITER_UBUF) return (const struct iovec *) &iter->__ubuf_iovec; return iter->__iov; } #define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset) #define iter_iov_len(iter) (iter_iov(iter)->iov_len - (iter)->iov_offset) static inline enum iter_type iov_iter_type(const struct iov_iter *i) { return i->iter_type; } static inline void iov_iter_save_state(struct iov_iter *iter, struct iov_iter_state *state) { state->iov_offset = iter->iov_offset; state->count = iter->count; state->nr_segs = iter->nr_segs; } static inline bool iter_is_ubuf(const struct iov_iter *i) { return iov_iter_type(i) == ITER_UBUF; } static inline bool iter_is_iovec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_IOVEC; } static inline bool iov_iter_is_kvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_KVEC; } static inline bool iov_iter_is_bvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_BVEC; } static inline bool iov_iter_is_discard(const struct iov_iter *i) { return iov_iter_type(i) == ITER_DISCARD; } static inline bool iov_iter_is_xarray(const struct iov_iter *i) { return iov_iter_type(i) == ITER_XARRAY; } static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->data_source ? WRITE : READ; } static inline bool user_backed_iter(const struct iov_iter *i) { return i->user_backed; } /* * Total number of bytes covered by an iovec. * * NOTE that it is not safe to use this function until all the iovec's * segment lengths have been validated. Because the individual lengths can * overflow a size_t when added together. */ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) { unsigned long seg; size_t ret = 0; for (seg = 0; seg < nr_segs; seg++) ret += iov[seg].iov_len; return ret; } size_t copy_page_from_iter_atomic(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_to_iter(&folio->page, offset, bytes, i); } static inline size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_from_iter_atomic(&folio->page, offset, bytes, i); } size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, true)) return _copy_to_iter(addr, bytes, i); return 0; } static __always_inline __must_check size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter_nocache(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter_nocache(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /* * Note, users like pmem that depend on the stricter semantics of * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the * destination is flushed from the cache on return. */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_from_iter_flushcache _copy_from_iter_nocache #endif #ifdef CONFIG_ARCH_HAS_COPY_MC size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); static inline void iov_iter_set_copy_mc(struct iov_iter *i) { i->copy_mc = true; } static inline bool iov_iter_is_copy_mc(const struct iov_iter *i) { return i->copy_mc; } #else #define _copy_mc_to_iter _copy_to_iter static inline void iov_iter_set_copy_mc(struct iov_iter *i) { } static inline bool iov_iter_is_copy_mc(const struct iov_iter *i) { return false; } #endif size_t iov_iter_zero(size_t bytes, struct iov_iter *); bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags); static inline size_t iov_iter_count(const struct iov_iter *i) { return i->count; } /* * Cap the iov_iter by given limit; note that the second argument is * *not* the new size - it's upper limit for such. Passing it a value * greater than the amount of data in iov_iter is fine - it'll just do * nothing in that case. */ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) { /* * count doesn't have to fit in size_t - comparison extends both * operands to u64 here and any value that would be truncated by * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ if (i->count > count) i->count = count; } /* * reexpand a previously truncated iterator; count must be no more than how much * we had shrunk it. */ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { i->count = count; } static inline int iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes) { size_t shorted = 0; int npages; if (iov_iter_count(i) > max_bytes) { shorted = iov_iter_count(i) - max_bytes; iov_iter_truncate(i, max_bytes); } npages = iov_iter_npages(i, maxpages); if (shorted) iov_iter_reexpand(i, iov_iter_count(i) + shorted); return npages; } struct csum_state { __wsum csum; size_t off; }; size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); static __always_inline __must_check bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) { size_t copied = csum_and_copy_from_iter(addr, bytes, csum, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, struct iov_iter *i); struct iovec *iovec_from_user(const struct iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat); ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i); ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); int import_single_range(int type, void __user *buf, size_t len, struct iovec *iov, struct iov_iter *i); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, void __user *buf, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_UBUF, .copy_mc = false, .user_backed = true, .data_source = direction, .ubuf = buf, .count = count, .nr_segs = 1 }; } /* Flags for iov_iter_get/extract_pages*() */ /* Allow P2PDMA on the extracted pages */ #define ITER_ALLOW_P2PDMA ((__force iov_iter_extraction_t)0x01) ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0); /** * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained * @iter: The iterator * * Examine the iterator and indicate by returning true or false as to how, if * at all, pages extracted from the iterator will be retained by the extraction * function. * * %true indicates that the pages will have a pin placed in them that the * caller must unpin. This is must be done for DMA/async DIO to force fork() * to forcibly copy a page for the child (the parent must retain the original * page). * * %false indicates that no measures are taken and that it's up to the caller * to retain the pages. */ static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter) { return user_backed_iter(iter); } struct sg_table; ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags); #endif |
1 5960 1613 2 3361 3261 658 3390 1 1811 402 3390 3390 3342 3306 1078 3387 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM block #if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BLOCK_H #include <linux/blktrace_api.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/tracepoint.h> #define RWBS_LEN 8 #ifdef CONFIG_BUFFER_HEAD DECLARE_EVENT_CLASS(block_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh), TP_STRUCT__entry ( __field( dev_t, dev ) __field( sector_t, sector ) __field( size_t, size ) ), TP_fast_assign( __entry->dev = bh->b_bdev->bd_dev; __entry->sector = bh->b_blocknr; __entry->size = bh->b_size; ), TP_printk("%d,%d sector=%llu size=%zu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->sector, __entry->size ) ); /** * block_touch_buffer - mark a buffer accessed * @bh: buffer_head being touched * * Called from touch_buffer(). */ DEFINE_EVENT(block_buffer, block_touch_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); /** * block_dirty_buffer - mark a buffer dirty * @bh: buffer_head being dirtied * * Called from mark_buffer_dirty(). */ DEFINE_EVENT(block_buffer, block_dirty_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); #endif /* CONFIG_BUFFER_HEAD */ /** * block_rq_requeue - place block IO request back on a queue * @rq: block IO operation request * * The block operation request @rq is being placed back into queue * @q. For some reason the request was not completed and needs to be * put back in the queue. */ TRACE_EVENT(block_rq_requeue, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, 0) ); DECLARE_EVENT_CLASS(block_rq_completion, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( int , error ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; __entry->error = blk_status_to_errno(error); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __entry->error) ); /** * block_rq_complete - block IO operation completed by device driver * @rq: block operations request * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_complete tracepoint event indicates that some portion * of operation request has been completed by the device driver. If * the @rq->bio is %NULL, then there is absolutely no additional work to * do for the request. If @rq->bio is non-NULL then there is * additional work required to complete the request. */ DEFINE_EVENT(block_rq_completion, block_rq_complete, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes) ); /** * block_rq_error - block IO operation error reported by device driver * @rq: block operations request * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_error tracepoint event indicates that some portion * of operation request has failed as reported by the device driver. */ DEFINE_EVENT(block_rq_completion, block_rq_error, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes) ); DECLARE_EVENT_CLASS(block_rq, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( unsigned int, bytes ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __entry->bytes, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __entry->comm) ); /** * block_rq_insert - insert block operation request into queue * @rq: block IO operation request * * Called immediately before block operation request @rq is inserted * into queue @q. The fields in the operation request @rq struct can * be examined to determine which device and sectors the pending * operation would access. */ DEFINE_EVENT(block_rq, block_rq_insert, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_issue - issue pending block IO request operation to device driver * @rq: block IO operation request * * Called when block operation request @rq from queue @q is sent to a * device driver for processing. */ DEFINE_EVENT(block_rq, block_rq_issue, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_merge - merge request with another one in the elevator * @rq: block IO operation request * * Called when block operation request @rq from queue @q is merged to another * request queued in the elevator. */ DEFINE_EVENT(block_rq, block_rq_merge, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_io_start - insert a request for execution * @rq: block IO operation request * * Called when block operation request @rq is queued for execution */ DEFINE_EVENT(block_rq, block_io_start, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_io_done - block IO operation request completed * @rq: block IO operation request * * Called when block operation request @rq is completed */ DEFINE_EVENT(block_rq, block_io_done, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_bio_complete - completed all work on the block operation * @q: queue holding the block operation * @bio: block operation completed * * This tracepoint indicates there is no further work to do on this * block IO operation @bio. */ TRACE_EVENT(block_bio_complete, TP_PROTO(struct request_queue *q, struct bio *bio), TP_ARGS(q, bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned, nr_sector ) __field( int, error ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = blk_status_to_errno(bio->bi_status); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->error) ); DECLARE_EVENT_CLASS(block_bio, TP_PROTO(struct bio *bio), TP_ARGS(bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->comm) ); /** * block_bio_bounce - used bounce buffer when processing block operation * @bio: block operation * * A bounce buffer was used to handle the block operation @bio in @q. * This occurs when hardware limitations prevent a direct transfer of * data between the @bio data memory area and the IO device. Use of a * bounce buffer requires extra copying of data and decreases * performance. */ DEFINE_EVENT(block_bio, block_bio_bounce, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_backmerge - merging block operation to the end of an existing operation * @bio: new block operation to merge * * Merging block request @bio to the end of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_backmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_frontmerge - merging block operation to the beginning of an existing operation * @bio: new block operation to merge * * Merging block IO operation @bio to the beginning of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_frontmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_queue - putting new block IO operation in queue * @bio: new block operation * * About to place the block IO operation @bio into queue @q. */ DEFINE_EVENT(block_bio, block_bio_queue, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_getrq - get a free request entry in queue for block IO operations * @bio: pending block IO operation (can be %NULL) * * A request struct has been allocated to handle the block IO operation @bio. */ DEFINE_EVENT(block_bio, block_getrq, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_plug - keep operations requests in request queue * @q: request queue to plug * * Plug the request queue @q. Do not allow block operation requests * to be sent to the device driver. Instead, accumulate requests in * the queue to improve throughput performance of the block device. */ TRACE_EVENT(block_plug, TP_PROTO(struct request_queue *q), TP_ARGS(q), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s]", __entry->comm) ); DECLARE_EVENT_CLASS(block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit), TP_STRUCT__entry( __field( int, nr_rq ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->nr_rq = depth; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) ); /** * block_unplug - release of operations requests in request queue * @q: request queue to unplug * @depth: number of requests just added to the queue * @explicit: whether this was an explicit unplug, or one from schedule() * * Unplug request queue @q because device driver is scheduled to work * on elements in the request queue. */ DEFINE_EVENT(block_unplug, block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit) ); /** * block_split - split a single bio struct into two bio structs * @bio: block operation being split * @new_sector: The starting sector for the new bio * * The bio request @bio needs to be split into two bio requests. The newly * created @bio request starts at @new_sector. This split may be required due to * hardware limitations such as operation crossing device boundaries in a RAID * system. */ TRACE_EVENT(block_split, TP_PROTO(struct bio *bio, unsigned int new_sector), TP_ARGS(bio, new_sector), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( sector_t, new_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu / %llu [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, (unsigned long long)__entry->new_sector, __entry->comm) ); /** * block_bio_remap - map request for a logical device to the raw device * @bio: revised operation * @dev: original device for the operation * @from: original sector for the operation * * An operation for a logical device has been mapped to the * raw block device. */ TRACE_EVENT(block_bio_remap, TP_PROTO(struct bio *bio, dev_t dev, sector_t from), TP_ARGS(bio, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector) ); /** * block_rq_remap - map request for a block operation request * @rq: block IO operation request * @dev: device for the operation * @from: original sector for the operation * * The block operation request @rq in @q has been remapped. The block * operation request @rq holds the current information and @from hold * the original sector. */ TRACE_EVENT(block_rq_remap, TP_PROTO(struct request *rq, dev_t dev, sector_t from), TP_ARGS(rq, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __field( unsigned int, nr_bios ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = disk_devt(rq->q->disk); __entry->sector = blk_rq_pos(rq); __entry->nr_sector = blk_rq_sectors(rq); __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector, __entry->nr_bios) ); #endif /* _TRACE_BLOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
2 2 2 2 2 1630 139 140 140 140 1630 2 2 2 2 2 4 3 2 1 1 1 1 1 1 4 5 5 5 5 5 5 14 11 2 14 10 10 5 5 5 1 1 1 2 1 2 2 4 4 4 4 4 6 6 5 2 4 2 6 12 2 10 5 6 2 1 3 12 7 2 1 7 7 5 1 1 1 1 1 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS An implementation of the IP virtual server support for the * LINUX operating system. IPVS is now implemented as a module * over the NetFilter framework. IPVS can be used to build a * high-performance and highly available server based on a * cluster of servers. * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * Julian Anastasov <ja@ssi.bg> * * Changes: */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/capability.h> #include <linux/fs.h> #include <linux/sysctl.h> #include <linux/proc_fs.h> #include <linux/workqueue.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/mutex.h> #include <net/net_namespace.h> #include <linux/nsproxy.h> #include <net/ip.h> #ifdef CONFIG_IP_VS_IPV6 #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #endif #include <net/route.h> #include <net/sock.h> #include <net/genetlink.h> #include <linux/uaccess.h> #include <net/ip_vs.h> MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */ /* sysctl variables */ #ifdef CONFIG_IP_VS_DEBUG static int sysctl_ip_vs_debug_level = 0; int ip_vs_get_debug_level(void) { return sysctl_ip_vs_debug_level; } #endif /* Protos */ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ static bool __ip_vs_addr_is_local_v6(struct net *net, const struct in6_addr *addr) { struct flowi6 fl6 = { .daddr = *addr, }; struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); bool is_local; is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); dst_release(dst); return is_local; } #endif #ifdef CONFIG_SYSCTL /* * update_defense_level is called from keventd and from sysctl, * so it needs to protect itself from softirqs */ static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; int availmem; int nomem; int to_change = -1; /* we only count free and buffered memory (in pages) */ si_meminfo(&i); availmem = i.freeram + i.bufferram; /* however in linux 2.5 the i.bufferram is total page cache size, we need adjust it */ /* si_swapinfo(&i); */ /* availmem = availmem - (i.totalswap - i.freeswap); */ nomem = (availmem < ipvs->sysctl_amemthresh); local_bh_disable(); /* drop_entry */ spin_lock(&ipvs->dropentry_lock); switch (ipvs->sysctl_drop_entry) { case 0: atomic_set(&ipvs->dropentry, 0); break; case 1: if (nomem) { atomic_set(&ipvs->dropentry, 1); ipvs->sysctl_drop_entry = 2; } else { atomic_set(&ipvs->dropentry, 0); } break; case 2: if (nomem) { atomic_set(&ipvs->dropentry, 1); } else { atomic_set(&ipvs->dropentry, 0); ipvs->sysctl_drop_entry = 1; } break; case 3: atomic_set(&ipvs->dropentry, 1); break; } spin_unlock(&ipvs->dropentry_lock); /* drop_packet */ spin_lock(&ipvs->droppacket_lock); switch (ipvs->sysctl_drop_packet) { case 0: ipvs->drop_rate = 0; break; case 1: if (nomem) { ipvs->drop_rate = ipvs->drop_counter = ipvs->sysctl_amemthresh / (ipvs->sysctl_amemthresh-availmem); ipvs->sysctl_drop_packet = 2; } else { ipvs->drop_rate = 0; } break; case 2: if (nomem) { ipvs->drop_rate = ipvs->drop_counter = ipvs->sysctl_amemthresh / (ipvs->sysctl_amemthresh-availmem); } else { ipvs->drop_rate = 0; ipvs->sysctl_drop_packet = 1; } break; case 3: ipvs->drop_rate = ipvs->sysctl_am_droprate; break; } spin_unlock(&ipvs->droppacket_lock); /* secure_tcp */ spin_lock(&ipvs->securetcp_lock); switch (ipvs->sysctl_secure_tcp) { case 0: if (ipvs->old_secure_tcp >= 2) to_change = 0; break; case 1: if (nomem) { if (ipvs->old_secure_tcp < 2) to_change = 1; ipvs->sysctl_secure_tcp = 2; } else { if (ipvs->old_secure_tcp >= 2) to_change = 0; } break; case 2: if (nomem) { if (ipvs->old_secure_tcp < 2) to_change = 1; } else { if (ipvs->old_secure_tcp >= 2) to_change = 0; ipvs->sysctl_secure_tcp = 1; } break; case 3: if (ipvs->old_secure_tcp < 2) to_change = 1; break; } ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; if (to_change >= 0) ip_vs_protocol_timeout_change(ipvs, ipvs->sysctl_secure_tcp > 1); spin_unlock(&ipvs->securetcp_lock); local_bh_enable(); } /* Handler for delayed work for expiring no * destination connections */ static void expire_nodest_conn_handler(struct work_struct *work) { struct netns_ipvs *ipvs; ipvs = container_of(work, struct netns_ipvs, expire_nodest_conn_work.work); ip_vs_expire_nodest_conn_flush(ipvs); } /* * Timer for checking the defense */ #define DEFENSE_TIMER_PERIOD 1*HZ static void defense_work_handler(struct work_struct *work) { struct netns_ipvs *ipvs = container_of(work, struct netns_ipvs, defense_work.work); update_defense_level(ipvs); if (atomic_read(&ipvs->dropentry)) ip_vs_random_dropentry(ipvs); queue_delayed_work(system_long_wq, &ipvs->defense_work, DEFENSE_TIMER_PERIOD); } #endif static void est_reload_work_handler(struct work_struct *work) { struct netns_ipvs *ipvs = container_of(work, struct netns_ipvs, est_reload_work.work); int genid_done = atomic_read(&ipvs->est_genid_done); unsigned long delay = HZ / 10; /* repeat startups after failure */ bool repeat = false; int genid; int id; mutex_lock(&ipvs->est_mutex); genid = atomic_read(&ipvs->est_genid); for (id = 0; id < ipvs->est_kt_count; id++) { struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; /* netns clean up started, abort delayed work */ if (!ipvs->enable) goto unlock; if (!kd) continue; /* New config ? Stop kthread tasks */ if (genid != genid_done) ip_vs_est_kthread_stop(kd); if (!kd->task && !ip_vs_est_stopped(ipvs)) { /* Do not start kthreads above 0 in calc phase */ if ((!id || !ipvs->est_calc_phase) && ip_vs_est_kthread_start(ipvs, kd) < 0) repeat = true; } } atomic_set(&ipvs->est_genid_done, genid); if (repeat) queue_delayed_work(system_long_wq, &ipvs->est_reload_work, delay); unlock: mutex_unlock(&ipvs->est_mutex); } int ip_vs_use_count_inc(void) { return try_module_get(THIS_MODULE); } void ip_vs_use_count_dec(void) { module_put(THIS_MODULE); } /* * Hash table: for virtual service lookups */ #define IP_VS_SVC_TAB_BITS 8 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) /* the service table hashed by <protocol, addr, port> */ static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; /* the service table hashed by fwmark */ static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; /* * Returns hash value for virtual service */ static inline unsigned int ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; __u32 ahash; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif ahash = ntohl(addr_fold); ahash ^= ((size_t) ipvs >> 8); return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & IP_VS_SVC_TAB_MASK; } /* * Returns hash value of fwmark for virtual service lookup */ static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) { return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; } /* * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port> * or in the ip_vs_svc_fwm_table by fwmark. * Should be called with locked tables. */ static int ip_vs_svc_hash(struct ip_vs_service *svc) { unsigned int hash; if (svc->flags & IP_VS_SVC_F_HASHED) { pr_err("%s(): request for already hashed, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } if (svc->fwmark == 0) { /* * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table */ hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, &svc->addr, svc->port); hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* * Hash it by fwmark in svc_fwm_table */ hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); } svc->flags |= IP_VS_SVC_F_HASHED; /* increase its refcnt because it is referenced by the svc table */ atomic_inc(&svc->refcnt); return 1; } /* * Unhashes a service from svc_table / svc_fwm_table. * Should be called with locked tables. */ static int ip_vs_svc_unhash(struct ip_vs_service *svc) { if (!(svc->flags & IP_VS_SVC_F_HASHED)) { pr_err("%s(): request for unhash flagged, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } if (svc->fwmark == 0) { /* Remove it from the svc_table table */ hlist_del_rcu(&svc->s_list); } else { /* Remove it from the svc_fwm_table table */ hlist_del_rcu(&svc->f_list); } svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); return 1; } /* * Get service by {netns, proto,addr,port} in the service table. */ static inline struct ip_vs_service * __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { unsigned int hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) && (svc->protocol == protocol) && (svc->ipvs == ipvs)) { /* HIT */ return svc; } } return NULL; } /* * Get service by {fwmark} in the service table. */ static inline struct ip_vs_service * __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) { unsigned int hash; struct ip_vs_service *svc; /* Check for fwmark addressed entries */ hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { if (svc->fwmark == fwmark && svc->af == af && (svc->ipvs == ipvs)) { /* HIT */ return svc; } } return NULL; } /* Find service, called under RCU lock */ struct ip_vs_service * ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; /* * Check the table hashed by fwmark first */ if (fwmark) { svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark); if (svc) goto out; } /* * Check the table hashed by <protocol,addr,port> * for "full" addressed entries */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); if (!svc && protocol == IPPROTO_TCP && atomic_read(&ipvs->ftpsvc_counter) && (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) { /* * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT); } if (svc == NULL && atomic_read(&ipvs->nullsvc_counter)) { /* * Check if the catch-all port (port zero) exists */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0); } out: IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", fwmark, ip_vs_proto_name(protocol), IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), svc ? "hit" : "not hit"); return svc; } static inline void __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) { atomic_inc(&svc->refcnt); rcu_assign_pointer(dest->svc, svc); } static void ip_vs_service_free(struct ip_vs_service *svc) { ip_vs_stats_release(&svc->stats); kfree(svc); } static void ip_vs_service_rcu_free(struct rcu_head *head) { struct ip_vs_service *svc; svc = container_of(head, struct ip_vs_service, rcu_head); ip_vs_service_free(svc); } static void __ip_vs_svc_put(struct ip_vs_service *svc) { if (atomic_dec_and_test(&svc->refcnt)) { IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port)); call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); } } /* * Returns hash value for real service */ static inline unsigned int ip_vs_rs_hashkey(int af, const union nf_inet_addr *addr, __be16 port) { unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) & IP_VS_RTAB_MASK; } /* Hash ip_vs_dest in rs_table by <proto,addr,port>. */ static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { unsigned int hash; __be16 port; if (dest->in_rs_table) return; switch (IP_VS_DFWD_METHOD(dest)) { case IP_VS_CONN_F_MASQ: port = dest->port; break; case IP_VS_CONN_F_TUNNEL: switch (dest->tun_type) { case IP_VS_CONN_F_TUNNEL_TYPE_GUE: port = dest->tun_port; break; case IP_VS_CONN_F_TUNNEL_TYPE_IPIP: case IP_VS_CONN_F_TUNNEL_TYPE_GRE: port = 0; break; default: return; } break; default: return; } /* * Hash by proto,addr,port, * which are the parameters of the real service. */ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port); hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); dest->in_rs_table = 1; } /* Unhash ip_vs_dest from rs_table. */ static void ip_vs_rs_unhash(struct ip_vs_dest *dest) { /* * Remove it from the rs_table table. */ if (dest->in_rs_table) { hlist_del_rcu(&dest->d_list); dest->in_rs_table = 0; } } /* Check if real service by <proto,addr,port> is present */ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport) { unsigned int hash; struct ip_vs_dest *dest; /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && (dest->protocol == protocol || dest->vfwmark) && IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { /* HIT */ return true; } } return false; } /* Find real service record by <proto,addr,port>. * In case of multiple records with the same <proto,addr,port>, only * the first found record is returned. * * To be called under RCU lock. */ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport) { unsigned int hash; struct ip_vs_dest *dest; /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && (dest->protocol == protocol || dest->vfwmark) && IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { /* HIT */ return dest; } } return NULL; } /* Find real service record by <af,addr,tun_port>. * In case of multiple records with the same <af,addr,tun_port>, only * the first found record is returned. * * To be called under RCU lock. */ struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, const union nf_inet_addr *daddr, __be16 tun_port) { struct ip_vs_dest *dest; unsigned int hash; /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, tun_port); hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { if (dest->tun_port == tun_port && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) { /* HIT */ return dest; } } return NULL; } /* Lookup destination by {addr,port} in the given service * Called under RCU lock. */ static struct ip_vs_dest * ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; /* * Find the destination for the given service */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if ((dest->af == dest_af) && ip_vs_addr_equal(dest_af, &dest->addr, daddr) && (dest->port == dport)) { /* HIT */ return dest; } } return NULL; } /* * Find destination by {daddr,dport,vaddr,protocol} * Created to be used in ip_vs_process_message() in * the backup synchronization daemon. It finds the * destination to be bound to the received connection * on the backup. * Called under RCU lock, no refcnt is returned. */ struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol, __u32 fwmark, __u32 flags) { struct ip_vs_dest *dest; struct ip_vs_service *svc; __be16 port = dport; svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) port = 0; dest = ip_vs_lookup_dest(svc, dest_af, daddr, port); if (!dest) dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport); return dest; } void ip_vs_dest_dst_rcu_free(struct rcu_head *head) { struct ip_vs_dest_dst *dest_dst = container_of(head, struct ip_vs_dest_dst, rcu_head); dst_release(dest_dst->dst_cache); kfree(dest_dst); } /* Release dest_dst and dst_cache for dest in user context */ static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) { struct ip_vs_dest_dst *old; old = rcu_dereference_protected(dest->dest_dst, 1); if (old) { RCU_INIT_POINTER(dest->dest_dst, NULL); call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); } } /* * Lookup dest by {svc,addr,port} in the destination trash. * The destination trash is used to hold the destinations that are removed * from the service table but are still referenced by some conn entries. * The reason to add the destination trash is when the dest is temporary * down (either by administrator or by monitor program), the dest can be * picked back from the trash, the remaining connections to the dest can * continue, and the counting information of the dest is also useful for * scheduling. */ static struct ip_vs_dest * ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; struct netns_ipvs *ipvs = svc->ipvs; /* * Find the destination in trash */ spin_lock_bh(&ipvs->dest_trash_lock); list_for_each_entry(dest, &ipvs->dest_trash, t_list) { IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), refcount_read(&dest->refcnt)); if (dest->af == dest_af && ip_vs_addr_equal(dest_af, &dest->addr, daddr) && dest->port == dport && dest->vfwmark == svc->fwmark && dest->protocol == svc->protocol && (svc->fwmark || (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && dest->vport == svc->port))) { /* HIT */ list_del(&dest->t_list); goto out; } } dest = NULL; out: spin_unlock_bh(&ipvs->dest_trash_lock); return dest; } static void ip_vs_dest_rcu_free(struct rcu_head *head) { struct ip_vs_dest *dest; dest = container_of(head, struct ip_vs_dest, rcu_head); ip_vs_stats_release(&dest->stats); ip_vs_dest_put_and_free(dest); } static void ip_vs_dest_free(struct ip_vs_dest *dest) { struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); __ip_vs_dst_cache_reset(dest); __ip_vs_svc_put(svc); call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free); } /* * Clean up all the destinations in the trash * Called by the ip_vs_control_cleanup() * * When the ip_vs_control_clearup is activated by ipvs module exit, * the service tables must have been flushed and all the connections * are expired, and the refcnt of each destination in the trash must * be 1, so we simply release them here. */ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_dest *dest, *nxt; del_timer_sync(&ipvs->dest_trash_timer); /* No need to use dest_trash_lock */ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { list_del(&dest->t_list); ip_vs_dest_free(dest); } } static void ip_vs_stats_rcu_free(struct rcu_head *head) { struct ip_vs_stats_rcu *rs = container_of(head, struct ip_vs_stats_rcu, rcu_head); ip_vs_stats_release(&rs->s); kfree(rs); } static void ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c spin_lock(&src->lock); IP_VS_SHOW_STATS_COUNTER(conns); IP_VS_SHOW_STATS_COUNTER(inpkts); IP_VS_SHOW_STATS_COUNTER(outpkts); IP_VS_SHOW_STATS_COUNTER(inbytes); IP_VS_SHOW_STATS_COUNTER(outbytes); ip_vs_read_estimator(dst, src); spin_unlock(&src->lock); } static void ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) { dst->conns = (u32)src->conns; dst->inpkts = (u32)src->inpkts; dst->outpkts = (u32)src->outpkts; dst->inbytes = src->inbytes; dst->outbytes = src->outbytes; dst->cps = (u32)src->cps; dst->inpps = (u32)src->inpps; dst->outpps = (u32)src->outpps; dst->inbps = (u32)src->inbps; dst->outbps = (u32)src->outbps; } static void ip_vs_zero_stats(struct ip_vs_stats *stats) { spin_lock(&stats->lock); /* get current counters as zero point, rates are zeroed */ #define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c IP_VS_ZERO_STATS_COUNTER(conns); IP_VS_ZERO_STATS_COUNTER(inpkts); IP_VS_ZERO_STATS_COUNTER(outpkts); IP_VS_ZERO_STATS_COUNTER(inbytes); IP_VS_ZERO_STATS_COUNTER(outbytes); ip_vs_zero_estimator(stats); spin_unlock(&stats->lock); } /* Allocate fields after kzalloc */ int ip_vs_stats_init_alloc(struct ip_vs_stats *s) { int i; spin_lock_init(&s->lock); s->cpustats = alloc_percpu(struct ip_vs_cpu_stats); if (!s->cpustats) return -ENOMEM; for_each_possible_cpu(i) { struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i); u64_stats_init(&cs->syncp); } return 0; } struct ip_vs_stats *ip_vs_stats_alloc(void) { struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL); if (s && ip_vs_stats_init_alloc(s) >= 0) return s; kfree(s); return NULL; } void ip_vs_stats_release(struct ip_vs_stats *stats) { free_percpu(stats->cpustats); } void ip_vs_stats_free(struct ip_vs_stats *stats) { if (stats) { ip_vs_stats_release(stats); kfree(stats); } } /* * Update a destination in the given service */ static void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_service *old_svc; struct ip_vs_scheduler *sched; int conn_flags; /* We cannot modify an address and change the address family */ BUG_ON(!add && udest->af != dest->af); if (add && udest->af != svc->af) ipvs->mixed_address_family_dests++; /* keep the last_weight with latest non-0 weight */ if (add || udest->weight != 0) atomic_set(&dest->last_weight, udest->weight); /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; /* Need to rehash? */ if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_DFWD_METHOD(dest) || udest->tun_type != dest->tun_type || udest->tun_port != dest->tun_port) ip_vs_rs_unhash(dest); /* set the tunnel info */ dest->tun_type = udest->tun_type; dest->tun_port = udest->tun_port; dest->tun_flags = udest->tun_flags; /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; } else { /* FTP-NAT requires conntrack for mangling */ if (svc->port == FTPPORT) ip_vs_register_conntrack(svc); } atomic_set(&dest->conn_flags, conn_flags); /* Put the real service in rs_table if not present. */ ip_vs_rs_hash(ipvs, dest); /* bind the service */ old_svc = rcu_dereference_protected(dest->svc, 1); if (!old_svc) { __ip_vs_bind_svc(dest, svc); } else { if (old_svc != svc) { ip_vs_zero_stats(&dest->stats); __ip_vs_bind_svc(dest, svc); __ip_vs_svc_put(old_svc); } } /* set the dest status flags */ dest->flags |= IP_VS_DEST_F_AVAILABLE; if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) dest->flags &= ~IP_VS_DEST_F_OVERLOAD; dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; dest->af = udest->af; spin_lock_bh(&dest->dst_lock); __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); if (add) { list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; sched = rcu_dereference_protected(svc->scheduler, 1); if (sched && sched->add_dest) sched->add_dest(svc, dest); } else { sched = rcu_dereference_protected(svc->scheduler, 1); if (sched && sched->upd_dest) sched->upd_dest(svc, dest); } } /* * Create a destination for the given service */ static int ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; unsigned int atype; int ret; #ifdef CONFIG_IP_VS_IPV6 if (udest->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6)) return -EINVAL; ret = nf_defrag_ipv6_enable(svc->ipvs->net); if (ret) return ret; } else #endif { atype = inet_addr_type(svc->ipvs->net, udest->addr.ip); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; } dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); if (dest == NULL) return -ENOMEM; ret = ip_vs_stats_init_alloc(&dest->stats); if (ret < 0) goto err_alloc; ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); if (ret < 0) goto err_stats; dest->af = udest->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; dest->vport = svc->port; dest->vfwmark = svc->fwmark; ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr); dest->port = udest->port; atomic_set(&dest->activeconns, 0); atomic_set(&dest->inactconns, 0); atomic_set(&dest->persistconns, 0); refcount_set(&dest->refcnt, 1); INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); __ip_vs_update_dest(svc, dest, udest, 1); return 0; err_stats: ip_vs_stats_release(&dest->stats); err_alloc: kfree(dest); return ret; } /* * Add a destination into an existing service */ static int ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; union nf_inet_addr daddr; __be16 dport = udest->port; int ret; if (udest->weight < 0) { pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; } if (udest->l_threshold > udest->u_threshold) { pr_err("%s(): lower threshold is higher than upper threshold\n", __func__); return -ERANGE; } if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { if (udest->tun_port == 0) { pr_err("%s(): tunnel port is zero\n", __func__); return -EINVAL; } } ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest != NULL) { IP_VS_DBG(1, "%s(): dest already exists\n", __func__); return -EEXIST; } /* * Check if the dest already exists in the trash and * is from the same service */ dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport); if (dest != NULL) { IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " "dest->refcnt=%d, service %u/%s:%u\n", IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), refcount_read(&dest->refcnt), dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); if (ret < 0) return ret; __ip_vs_update_dest(svc, dest, udest, 1); } else { /* * Allocate and initialize the dest structure */ ret = ip_vs_new_dest(svc, udest); } return ret; } /* * Edit a destination in the given service */ static int ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; union nf_inet_addr daddr; __be16 dport = udest->port; if (udest->weight < 0) { pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; } if (udest->l_threshold > udest->u_threshold) { pr_err("%s(): lower threshold is higher than upper threshold\n", __func__); return -ERANGE; } if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { if (udest->tun_port == 0) { pr_err("%s(): tunnel port is zero\n", __func__); return -EINVAL; } } ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); return -ENOENT; } __ip_vs_update_dest(svc, dest, udest, 0); return 0; } /* * Delete a destination (must be already unlinked from the service) */ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, bool cleanup) { ip_vs_stop_estimator(ipvs, &dest->stats); /* * Remove it from the d-linked list with the real services. */ ip_vs_rs_unhash(dest); spin_lock_bh(&ipvs->dest_trash_lock); IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), refcount_read(&dest->refcnt)); if (list_empty(&ipvs->dest_trash) && !cleanup) mod_timer(&ipvs->dest_trash_timer, jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); /* dest lives in trash with reference */ list_add(&dest->t_list, &ipvs->dest_trash); dest->idle_start = 0; spin_unlock_bh(&ipvs->dest_trash_lock); /* Queue up delayed work to expire all no destination connections. * No-op when CONFIG_SYSCTL is disabled. */ if (!cleanup) ip_vs_enqueue_expire_nodest_conns(ipvs); } /* * Unlink a destination from the given service */ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, int svcupd) { dest->flags &= ~IP_VS_DEST_F_AVAILABLE; /* * Remove it from the d-linked destination list. */ list_del_rcu(&dest->n_list); svc->num_dests--; if (dest->af != svc->af) svc->ipvs->mixed_address_family_dests--; if (svcupd) { struct ip_vs_scheduler *sched; sched = rcu_dereference_protected(svc->scheduler, 1); if (sched && sched->del_dest) sched->del_dest(svc, dest); } } /* * Delete a destination server in the given service */ static int ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; __be16 dport = udest->port; /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): destination not found!\n", __func__); return -ENOENT; } /* * Unlink dest from the service */ __ip_vs_unlink_dest(svc, dest, 1); /* * Delete the destination */ __ip_vs_del_dest(svc->ipvs, dest, false); return 0; } static void ip_vs_dest_trash_expire(struct timer_list *t) { struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer); struct ip_vs_dest *dest, *next; unsigned long now = jiffies; spin_lock(&ipvs->dest_trash_lock); list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { if (refcount_read(&dest->refcnt) > 1) continue; if (dest->idle_start) { if (time_before(now, dest->idle_start + IP_VS_DEST_TRASH_PERIOD)) continue; } else { dest->idle_start = max(1UL, now); continue; } IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", dest->vfwmark, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); list_del(&dest->t_list); ip_vs_dest_free(dest); } if (!list_empty(&ipvs->dest_trash)) mod_timer(&ipvs->dest_trash_timer, jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); spin_unlock(&ipvs->dest_trash_lock); } /* * Add a service into the service hash table */ static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { int ret = 0; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; int ret_hooks = -1; /* increase the module use count */ if (!ip_vs_use_count_inc()) return -ENOPROTOOPT; /* Lookup the scheduler by 'u->sched_name' */ if (strcmp(u->sched_name, "none")) { sched = ip_vs_scheduler_get(u->sched_name); if (!sched) { pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); ret = -ENOENT; goto out_err; } } if (u->pe_name && *u->pe_name) { pe = ip_vs_pe_getbyname(u->pe_name); if (pe == NULL) { pr_info("persistence engine module ip_vs_pe_%s " "not found\n", u->pe_name); ret = -ENOENT; goto out_err; } } #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6) { __u32 plen = (__force __u32) u->netmask; if (plen < 1 || plen > 128) { ret = -EINVAL; goto out_err; } ret = nf_defrag_ipv6_enable(ipvs->net); if (ret) goto out_err; } #endif if ((u->af == AF_INET && !ipvs->num_services) || (u->af == AF_INET6 && !ipvs->num_services6)) { ret = ip_vs_register_hooks(ipvs, u->af); if (ret < 0) goto out_err; ret_hooks = ret; } svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); if (svc == NULL) { IP_VS_DBG(1, "%s(): no memory\n", __func__); ret = -ENOMEM; goto out_err; } ret = ip_vs_stats_init_alloc(&svc->stats); if (ret < 0) goto out_err; /* I'm the first user of the service */ atomic_set(&svc->refcnt, 0); svc->af = u->af; svc->protocol = u->protocol; ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); svc->port = u->port; svc->fwmark = u->fwmark; svc->flags = u->flags & ~IP_VS_SVC_F_HASHED; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; svc->ipvs = ipvs; INIT_LIST_HEAD(&svc->destinations); spin_lock_init(&svc->sched_lock); /* Bind the scheduler */ if (sched) { ret = ip_vs_bind_scheduler(svc, sched); if (ret) goto out_err; sched = NULL; } ret = ip_vs_start_estimator(ipvs, &svc->stats); if (ret < 0) goto out_err; /* Bind the ct retriever */ RCU_INIT_POINTER(svc->pe, pe); pe = NULL; /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); if (svc->pe && svc->pe->conn_out) atomic_inc(&ipvs->conn_out_counter); /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services++; else if (svc->af == AF_INET6) ipvs->num_services6++; /* Hash the service into the service table */ ip_vs_svc_hash(svc); *svc_p = svc; if (!ipvs->enable) { /* Now there is a service - full throttle */ ipvs->enable = 1; /* Start estimation for first time */ ip_vs_est_reload_start(ipvs); } return 0; out_err: if (ret_hooks >= 0) ip_vs_unregister_hooks(ipvs, u->af); if (svc != NULL) { ip_vs_unbind_scheduler(svc, sched); ip_vs_service_free(svc); } ip_vs_scheduler_put(sched); ip_vs_pe_put(pe); /* decrease the module use count */ ip_vs_use_count_dec(); return ret; } /* * Edit a service and bind it with a new scheduler */ static int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; bool new_pe_conn_out, old_pe_conn_out; /* * Lookup the scheduler, by 'u->sched_name' */ if (strcmp(u->sched_name, "none")) { sched = ip_vs_scheduler_get(u->sched_name); if (!sched) { pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); return -ENOENT; } } old_sched = sched; if (u->pe_name && *u->pe_name) { pe = ip_vs_pe_getbyname(u->pe_name); if (pe == NULL) { pr_info("persistence engine module ip_vs_pe_%s " "not found\n", u->pe_name); ret = -ENOENT; goto out; } old_pe = pe; } #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6) { __u32 plen = (__force __u32) u->netmask; if (plen < 1 || plen > 128) { ret = -EINVAL; goto out; } } #endif old_sched = rcu_dereference_protected(svc->scheduler, 1); if (sched != old_sched) { if (old_sched) { ip_vs_unbind_scheduler(svc, old_sched); RCU_INIT_POINTER(svc->scheduler, NULL); /* Wait all svc->sched_data users */ synchronize_rcu(); } /* Bind the new scheduler */ if (sched) { ret = ip_vs_bind_scheduler(svc, sched); if (ret) { ip_vs_scheduler_put(sched); goto out; } } } /* * Set the flags and timeout value */ svc->flags = u->flags | IP_VS_SVC_F_HASHED; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; old_pe = rcu_dereference_protected(svc->pe, 1); if (pe != old_pe) { rcu_assign_pointer(svc->pe, pe); /* check for optional methods in new pe */ new_pe_conn_out = (pe && pe->conn_out) ? true : false; old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false; if (new_pe_conn_out && !old_pe_conn_out) atomic_inc(&svc->ipvs->conn_out_counter); if (old_pe_conn_out && !new_pe_conn_out) atomic_dec(&svc->ipvs->conn_out_counter); } out: ip_vs_scheduler_put(old_sched); ip_vs_pe_put(old_pe); return ret; } /* * Delete a service from the service list * - The service must be unlinked, unlocked and not referenced! * - We are called under _bh lock */ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) { struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; struct ip_vs_pe *old_pe; struct netns_ipvs *ipvs = svc->ipvs; if (svc->af == AF_INET) { ipvs->num_services--; if (!ipvs->num_services) ip_vs_unregister_hooks(ipvs, svc->af); } else if (svc->af == AF_INET6) { ipvs->num_services6--; if (!ipvs->num_services6) ip_vs_unregister_hooks(ipvs, svc->af); } ip_vs_stop_estimator(svc->ipvs, &svc->stats); /* Unbind scheduler */ old_sched = rcu_dereference_protected(svc->scheduler, 1); ip_vs_unbind_scheduler(svc, old_sched); ip_vs_scheduler_put(old_sched); /* Unbind persistence engine, keep svc->pe */ old_pe = rcu_dereference_protected(svc->pe, 1); if (old_pe && old_pe->conn_out) atomic_dec(&ipvs->conn_out_counter); ip_vs_pe_put(old_pe); /* * Unlink the whole destination list */ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { __ip_vs_unlink_dest(svc, dest, 0); __ip_vs_del_dest(svc->ipvs, dest, cleanup); } /* * Update the virtual service counters */ if (svc->port == FTPPORT) atomic_dec(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_dec(&ipvs->nullsvc_counter); /* * Free the service if nobody refers to it */ __ip_vs_svc_put(svc); /* decrease the module use count */ ip_vs_use_count_dec(); } /* * Unlink a service from list and try to delete it if its refcnt reached 0 */ static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) { ip_vs_unregister_conntrack(svc); /* Hold svc to avoid double release from dest_trash */ atomic_inc(&svc->refcnt); /* * Unhash it from the service table */ ip_vs_svc_unhash(svc); __ip_vs_del_service(svc, cleanup); } /* * Delete a service from the service list */ static int ip_vs_del_service(struct ip_vs_service *svc) { if (svc == NULL) return -EEXIST; ip_vs_unlink_service(svc, false); return 0; } /* * Flush all the virtual services */ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) { int idx; struct ip_vs_service *svc; struct hlist_node *n; /* * Flush the service table hashed by <netns,protocol,addr,port> */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], s_list) { if (svc->ipvs == ipvs) ip_vs_unlink_service(svc, cleanup); } } /* * Flush the service table hashed by fwmark */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], f_list) { if (svc->ipvs == ipvs) ip_vs_unlink_service(svc, cleanup); } } return 0; } /* * Delete service by {netns} in the service table. * Called by __ip_vs_batch_cleanup() */ void ip_vs_service_nets_cleanup(struct list_head *net_list) { struct netns_ipvs *ipvs; struct net *net; /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); ip_vs_flush(ipvs, true); } mutex_unlock(&__ip_vs_mutex); } /* Put all references for device (dst_cache) */ static inline void ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) { struct ip_vs_dest_dst *dest_dst; spin_lock_bh(&dest->dst_lock); dest_dst = rcu_dereference_protected(dest->dest_dst, 1); if (dest_dst && dest_dst->dst_cache->dev == dev) { IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", dev->name, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), refcount_read(&dest->refcnt)); __ip_vs_dst_cache_reset(dest); } spin_unlock_bh(&dest->dst_lock); } /* Netdev event receiver * Currently only NETDEV_DOWN is handled to release refs to cached dsts */ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_service *svc; struct ip_vs_dest *dest; unsigned int idx; if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); mutex_lock(&__ip_vs_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (svc->ipvs == ipvs) { list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_forget_dev(dest, dev); } } } hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (svc->ipvs == ipvs) { list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_forget_dev(dest, dev); } } } } spin_lock_bh(&ipvs->dest_trash_lock); list_for_each_entry(dest, &ipvs->dest_trash, t_list) { ip_vs_forget_dev(dest, dev); } spin_unlock_bh(&ipvs->dest_trash_lock); mutex_unlock(&__ip_vs_mutex); return NOTIFY_DONE; } /* * Zero counters in a service or all services */ static int ip_vs_zero_service(struct ip_vs_service *svc) { struct ip_vs_dest *dest; list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_zero_stats(&dest->stats); } ip_vs_zero_stats(&svc->stats); return 0; } static int ip_vs_zero_all(struct netns_ipvs *ipvs) { int idx; struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (svc->ipvs == ipvs) ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (svc->ipvs == ipvs) ip_vs_zero_service(svc); } } ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; } #ifdef CONFIG_SYSCTL static int proc_do_defense_mode(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val = *valp; int rc; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(int), .mode = table->mode, }; rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write && (*valp != val)) { if (val < 0 || val > 3) { rc = -EINVAL; } else { *valp = val; update_defense_level(ipvs); } } return rc; } static int proc_do_sync_threshold(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val[2]; int rc; struct ctl_table tmp = { .data = &val, .maxlen = table->maxlen, .mode = table->mode, }; mutex_lock(&ipvs->sync_mutex); memcpy(val, valp, sizeof(val)); rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write) { if (val[0] < 0 || val[1] < 0 || (val[0] >= val[1] && val[1])) rc = -EINVAL; else memcpy(valp, val, sizeof(val)); } mutex_unlock(&ipvs->sync_mutex); return rc; } static int proc_do_sync_ports(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val = *valp; int rc; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(int), .mode = table->mode, }; rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write && (*valp != val)) { if (val < 1 || !is_power_of_2(val)) rc = -EINVAL; else *valp = val; } return rc; } static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer) { struct netns_ipvs *ipvs = table->extra2; cpumask_var_t *valp = table->data; cpumask_var_t newmask; int ret; if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) return -ENOMEM; ret = cpulist_parse(buffer, newmask); if (ret) goto out; mutex_lock(&ipvs->est_mutex); if (!ipvs->est_cpulist_valid) { if (!zalloc_cpumask_var(valp, GFP_KERNEL)) { ret = -ENOMEM; goto unlock; } ipvs->est_cpulist_valid = 1; } cpumask_and(newmask, newmask, ¤t->cpus_mask); cpumask_copy(*valp, newmask); /* est_max_threads may depend on cpulist size */ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); ipvs->est_calc_phase = 1; ip_vs_est_reload_start(ipvs); unlock: mutex_unlock(&ipvs->est_mutex); out: free_cpumask_var(newmask); return ret; } static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer, size_t size) { struct netns_ipvs *ipvs = table->extra2; cpumask_var_t *valp = table->data; struct cpumask *mask; int ret; mutex_lock(&ipvs->est_mutex); if (ipvs->est_cpulist_valid) mask = *valp; else mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); mutex_unlock(&ipvs->est_mutex); return ret; } static int ipvs_proc_est_cpulist(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; /* Ignore both read and write(append) if *ppos not 0 */ if (*ppos || !*lenp) { *lenp = 0; return 0; } if (write) { /* proc_sys_call_handler() appends terminator */ ret = ipvs_proc_est_cpumask_set(table, buffer); if (ret >= 0) *ppos += *lenp; } else { /* proc_sys_call_handler() allocates 1 byte for terminator */ ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1); if (ret >= 0) { *lenp = ret; *ppos += *lenp; ret = 0; } } return ret; } static int ipvs_proc_est_nice(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val = *valp; int ret; struct ctl_table tmp_table = { .data = &val, .maxlen = sizeof(int), .mode = table->mode, }; ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); if (write && ret >= 0) { if (val < MIN_NICE || val > MAX_NICE) { ret = -EINVAL; } else { mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; ip_vs_est_reload_start(ipvs); } mutex_unlock(&ipvs->est_mutex); } } return ret; } static int ipvs_proc_run_estimation(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val = *valp; int ret; struct ctl_table tmp_table = { .data = &val, .maxlen = sizeof(int), .mode = table->mode, }; ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); if (write && ret >= 0) { mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; ip_vs_est_reload_start(ipvs); } mutex_unlock(&ipvs->est_mutex); } return ret; } /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without * align with netns init in ip_vs_control_net_init() */ static struct ctl_table vs_vars[] = { { .procname = "amemthresh", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "am_droprate", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "drop_entry", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, { .procname = "drop_packet", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, #ifdef CONFIG_IP_VS_NFCT { .procname = "conntrack", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, #endif { .procname = "secure_tcp", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, { .procname = "snat_reroute", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { .procname = "sync_version", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "sync_ports", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_sync_ports, }, { .procname = "sync_persist_mode", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sync_qlen_max", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "sync_sock_size", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "cache_bypass", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "expire_nodest_conn", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sloppy_tcp", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sloppy_sctp", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "expire_quiescent_template", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sync_threshold", .maxlen = sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), .mode = 0644, .proc_handler = proc_do_sync_threshold, }, { .procname = "sync_refresh_period", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "sync_retries", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "nat_icmp_send", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "pmtu_disc", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "backup_only", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "conn_reuse_mode", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "schedule_icmp", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "ignore_tunneled", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "run_estimation", .maxlen = sizeof(int), .mode = 0644, .proc_handler = ipvs_proc_run_estimation, }, { .procname = "est_cpulist", .maxlen = NR_CPUS, /* unused */ .mode = 0644, .proc_handler = ipvs_proc_est_cpulist, }, { .procname = "est_nice", .maxlen = sizeof(int), .mode = 0644, .proc_handler = ipvs_proc_est_nice, }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", .data = &sysctl_ip_vs_debug_level, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif { } }; #endif #ifdef CONFIG_PROC_FS struct ip_vs_iter { struct seq_net_private p; /* Do not move this, netns depends upon it*/ struct hlist_head *table; int bucket; }; /* * Write the contents of the VS rule table to a PROCfs file. * (It is kept just for backward compatibility) */ static inline const char *ip_vs_fwd_name(unsigned int flags) { switch (flags & IP_VS_CONN_F_FWD_MASK) { case IP_VS_CONN_F_LOCALNODE: return "Local"; case IP_VS_CONN_F_TUNNEL: return "Tunnel"; case IP_VS_CONN_F_DROUTE: return "Route"; default: return "Masq"; } } /* Get the Nth entry in the two lists */ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_iter *iter = seq->private; int idx; struct ip_vs_service *svc; /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { if ((svc->ipvs == ipvs) && pos-- == 0) { iter->table = ip_vs_svc_table; iter->bucket = idx; return svc; } } } /* keep looking in fwmark */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], f_list) { if ((svc->ipvs == ipvs) && pos-- == 0) { iter->table = ip_vs_svc_fwm_table; iter->bucket = idx; return svc; } } } return NULL; } static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct hlist_node *e; struct ip_vs_iter *iter; struct ip_vs_service *svc; ++*pos; if (v == SEQ_START_TOKEN) return ip_vs_info_array(seq,0); svc = v; iter = seq->private; if (iter->table == ip_vs_svc_table) { /* next service in table hashed by protocol */ e = rcu_dereference(hlist_next_rcu(&svc->s_list)); if (e) return hlist_entry(e, struct ip_vs_service, s_list); while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[iter->bucket], s_list) { return svc; } } iter->table = ip_vs_svc_fwm_table; iter->bucket = -1; goto scan_fwmark; } /* next service in hashed by fwmark */ e = rcu_dereference(hlist_next_rcu(&svc->f_list)); if (e) return hlist_entry(e, struct ip_vs_service, f_list); scan_fwmark: while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[iter->bucket], f_list) return svc; } return NULL; } static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip_vs_info_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_printf(seq, "IP Virtual Server version %d.%d.%d (size=%d)\n", NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); seq_puts(seq, "Prot LocalAddress:Port Scheduler Flags\n"); seq_puts(seq, " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); } else { struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); const struct ip_vs_service *svc = v; const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); char *sched_name = sched ? sched->name : "none"; if (svc->ipvs != ipvs) return 0; if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) seq_printf(seq, "%s [%pI6]:%04X %s ", ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), sched_name); else #endif seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { seq_printf(seq, "FWM %08X %s %s", svc->fwmark, sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } if (svc->flags & IP_VS_SVC_F_PERSISTENT) seq_printf(seq, "persistent %d %08X\n", svc->timeout, ntohl(svc->netmask)); else seq_putc(seq, '\n'); list_for_each_entry_rcu(dest, &svc->destinations, n_list) { #ifdef CONFIG_IP_VS_IPV6 if (dest->af == AF_INET6) seq_printf(seq, " -> [%pI6]:%04X" " %-7s %-6d %-10d %-10d\n", &dest->addr.in6, ntohs(dest->port), ip_vs_fwd_name(atomic_read(&dest->conn_flags)), atomic_read(&dest->weight), atomic_read(&dest->activeconns), atomic_read(&dest->inactconns)); else #endif seq_printf(seq, " -> %08X:%04X " "%-7s %-6d %-10d %-10d\n", ntohl(dest->addr.ip), ntohs(dest->port), ip_vs_fwd_name(atomic_read(&dest->conn_flags)), atomic_read(&dest->weight), atomic_read(&dest->activeconns), atomic_read(&dest->inactconns)); } } return 0; } static const struct seq_operations ip_vs_info_seq_ops = { .start = ip_vs_info_seq_start, .next = ip_vs_info_seq_next, .stop = ip_vs_info_seq_stop, .show = ip_vs_info_seq_show, }; static int ip_vs_stats_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); struct ip_vs_kstats show; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Total Incoming Outgoing Incoming Outgoing\n"); seq_puts(seq, " Conns Packets Packets Bytes Bytes\n"); ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", (unsigned long long)show.conns, (unsigned long long)show.inpkts, (unsigned long long)show.outpkts, (unsigned long long)show.inbytes, (unsigned long long)show.outbytes); /* 01234567 01234567 01234567 0123456701234567 0123456701234567*/ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n", (unsigned long long)show.cps, (unsigned long long)show.inpps, (unsigned long long)show.outpps, (unsigned long long)show.inbps, (unsigned long long)show.outbps); return 0; } static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; struct ip_vs_kstats kstats; int i; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Total Incoming Outgoing Incoming Outgoing\n"); seq_puts(seq, "CPU Conns Packets Packets Bytes Bytes\n"); for_each_possible_cpu(i) { struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); unsigned int start; u64 conns, inpkts, outpkts, inbytes, outbytes; do { start = u64_stats_fetch_begin(&u->syncp); conns = u64_stats_read(&u->cnt.conns); inpkts = u64_stats_read(&u->cnt.inpkts); outpkts = u64_stats_read(&u->cnt.outpkts); inbytes = u64_stats_read(&u->cnt.inbytes); outbytes = u64_stats_read(&u->cnt.outbytes); } while (u64_stats_fetch_retry(&u->syncp, start)); seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", i, (u64)conns, (u64)inpkts, (u64)outpkts, (u64)inbytes, (u64)outbytes); } ip_vs_copy_stats(&kstats, tot_stats); seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n", (unsigned long long)kstats.conns, (unsigned long long)kstats.inpkts, (unsigned long long)kstats.outpkts, (unsigned long long)kstats.inbytes, (unsigned long long)kstats.outbytes); /* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n", kstats.cps, kstats.inpps, kstats.outpps, kstats.inbps, kstats.outbps); return 0; } #endif /* * Set timeout values for tcp tcpfin udp in the timeout_table. */ static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; #endif IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", u->tcp_timeout, u->tcp_fin_timeout, u->udp_timeout); #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) || u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) { return -EINVAL; } #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ)) return -EINVAL; #endif #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout) { pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] = u->tcp_timeout * HZ; } if (u->tcp_fin_timeout) { pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] = u->tcp_fin_timeout * HZ; } #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout) { pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); pd->timeout_table[IP_VS_UDP_S_NORMAL] = u->udp_timeout * HZ; } #endif return 0; } #define CMDID(cmd) (cmd - IP_VS_BASE_CTL) struct ip_vs_svcdest_user { struct ip_vs_service_user s; struct ip_vs_dest_user d; }; static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = { [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user), [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user), [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user), [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user), [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user), [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user), [CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user), [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user), [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user), }; union ip_vs_set_arglen { struct ip_vs_service_user field_IP_VS_SO_SET_ADD; struct ip_vs_service_user field_IP_VS_SO_SET_EDIT; struct ip_vs_service_user field_IP_VS_SO_SET_DEL; struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST; struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST; struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST; struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT; struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON; struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON; struct ip_vs_service_user field_IP_VS_SO_SET_ZERO; }; #define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen) static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, struct ip_vs_service_user *usvc_compat) { memset(usvc, 0, sizeof(*usvc)); usvc->af = AF_INET; usvc->protocol = usvc_compat->protocol; usvc->addr.ip = usvc_compat->addr; usvc->port = usvc_compat->port; usvc->fwmark = usvc_compat->fwmark; /* Deep copy of sched_name is not needed here */ usvc->sched_name = usvc_compat->sched_name; usvc->flags = usvc_compat->flags; usvc->timeout = usvc_compat->timeout; usvc->netmask = usvc_compat->netmask; } static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, struct ip_vs_dest_user *udest_compat) { memset(udest, 0, sizeof(*udest)); udest->addr.ip = udest_compat->addr; udest->port = udest_compat->port; udest->conn_flags = udest_compat->conn_flags; udest->weight = udest_compat->weight; udest->u_threshold = udest_compat->u_threshold; udest->l_threshold = udest_compat->l_threshold; udest->af = AF_INET; udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP; } static int do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) { struct net *net = sock_net(sk); int ret; unsigned char arg[MAX_SET_ARGLEN]; struct ip_vs_service_user *usvc_compat; struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; struct ip_vs_dest_user *udest_compat; struct ip_vs_dest_user_kern udest; struct netns_ipvs *ipvs = net_ipvs(net); BUILD_BUG_ON(sizeof(arg) > 255); if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) return -EINVAL; if (len != set_arglen[CMDID(cmd)]) { IP_VS_DBG(1, "set_ctl: len %u != %u\n", len, set_arglen[CMDID(cmd)]); return -EINVAL; } if (copy_from_sockptr(arg, ptr, len) != 0) return -EFAULT; /* Handle daemons since they have another lock */ if (cmd == IP_VS_SO_SET_STARTDAEMON || cmd == IP_VS_SO_SET_STOPDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; if (cmd == IP_VS_SO_SET_STARTDAEMON) { struct ipvs_sync_daemon_cfg cfg; memset(&cfg, 0, sizeof(cfg)); ret = -EINVAL; if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, sizeof(cfg.mcast_ifn)) <= 0) return ret; cfg.syncid = dm->syncid; ret = start_sync_thread(ipvs, &cfg, dm->state); } else { ret = stop_sync_thread(ipvs, dm->state); } return ret; } mutex_lock(&__ip_vs_mutex); if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ ret = ip_vs_flush(ipvs, false); goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); goto out_unlock; } else if (!len) { /* No more commands with len == 0 below */ ret = -EINVAL; goto out_unlock; } usvc_compat = (struct ip_vs_service_user *)arg; udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); /* We only use the new structs internally, so copy userspace compat * structs to extended internal versions */ ip_vs_copy_usvc_compat(&usvc, usvc_compat); ip_vs_copy_udest_compat(&udest, udest_compat); if (cmd == IP_VS_SO_SET_ZERO) { /* if no service address is set, zero counters in all */ if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { ret = ip_vs_zero_all(ipvs); goto out_unlock; } } if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) && strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) == IP_VS_SCHEDNAME_MAXLEN) { ret = -EINVAL; goto out_unlock; } /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */ if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && usvc.protocol != IPPROTO_SCTP) { pr_err("set_ctl: invalid protocol: %d %pI4:%d\n", usvc.protocol, &usvc.addr.ip, ntohs(usvc.port)); ret = -EFAULT; goto out_unlock; } /* Lookup the exact service by <protocol, addr, port> or fwmark */ rcu_read_lock(); if (usvc.fwmark == 0) svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol, &usvc.addr, usvc.port); else svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark); rcu_read_unlock(); if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { ret = -ESRCH; goto out_unlock; } switch (cmd) { case IP_VS_SO_SET_ADD: if (svc != NULL) ret = -EEXIST; else ret = ip_vs_add_service(ipvs, &usvc, &svc); break; case IP_VS_SO_SET_EDIT: ret = ip_vs_edit_service(svc, &usvc); break; case IP_VS_SO_SET_DEL: ret = ip_vs_del_service(svc); if (!ret) goto out_unlock; break; case IP_VS_SO_SET_ZERO: ret = ip_vs_zero_service(svc); break; case IP_VS_SO_SET_ADDDEST: ret = ip_vs_add_dest(svc, &udest); break; case IP_VS_SO_SET_EDITDEST: ret = ip_vs_edit_dest(svc, &udest); break; case IP_VS_SO_SET_DELDEST: ret = ip_vs_del_dest(svc, &udest); break; default: WARN_ON_ONCE(1); ret = -EINVAL; break; } out_unlock: mutex_unlock(&__ip_vs_mutex); return ret; } static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { struct ip_vs_scheduler *sched; struct ip_vs_kstats kstats; char *sched_name; sched = rcu_dereference_protected(src->scheduler, 1); sched_name = sched ? sched->name : "none"; dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; dst->num_dests = src->num_dests; ip_vs_copy_stats(&kstats, &src->stats); ip_vs_export_stats_user(&dst->stats, &kstats); } static inline int __ip_vs_get_service_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { int idx, count=0; struct ip_vs_service *svc; struct ip_vs_service_entry entry; int ret = 0; for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || (svc->ipvs != ipvs)) continue; if (count >= get->num_services) goto out; memset(&entry, 0, sizeof(entry)); ip_vs_copy_service(&entry, svc); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; goto out; } count++; } } for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || (svc->ipvs != ipvs)) continue; if (count >= get->num_services) goto out; memset(&entry, 0, sizeof(entry)); ip_vs_copy_service(&entry, svc); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; goto out; } count++; } } out: return ret; } static inline int __ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get, struct ip_vs_get_dests __user *uptr) { struct ip_vs_service *svc; union nf_inet_addr addr = { .ip = get->addr }; int ret = 0; rcu_read_lock(); if (get->fwmark) svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark); else svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr, get->port); rcu_read_unlock(); if (svc) { int count = 0; struct ip_vs_dest *dest; struct ip_vs_dest_entry entry; struct ip_vs_kstats kstats; memset(&entry, 0, sizeof(entry)); list_for_each_entry(dest, &svc->destinations, n_list) { if (count >= get->num_dests) break; /* Cannot expose heterogeneous members via sockopt * interface */ if (dest->af != svc->af) continue; entry.addr = dest->addr.ip; entry.port = dest->port; entry.conn_flags = atomic_read(&dest->conn_flags); entry.weight = atomic_read(&dest->weight); entry.u_threshold = dest->u_threshold; entry.l_threshold = dest->l_threshold; entry.activeconns = atomic_read(&dest->activeconns); entry.inactconns = atomic_read(&dest->inactconns); entry.persistconns = atomic_read(&dest->persistconns); ip_vs_copy_stats(&kstats, &dest->stats); ip_vs_export_stats_user(&entry.stats, &kstats); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; break; } count++; } } else ret = -ESRCH; return ret; } static inline void __ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; #endif memset(u, 0, sizeof (*u)); #ifdef CONFIG_IP_VS_PROTO_TCP pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; #endif #ifdef CONFIG_IP_VS_PROTO_UDP pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); u->udp_timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; #endif } static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = { [CMDID(IP_VS_SO_GET_VERSION)] = 64, [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo), [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services), [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry), [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests), [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user), }; union ip_vs_get_arglen { char field_IP_VS_SO_GET_VERSION[64]; struct ip_vs_getinfo field_IP_VS_SO_GET_INFO; struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES; struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE; struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS; struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT; struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2]; }; #define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen) static int do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { unsigned char arg[MAX_GET_ARGLEN]; int ret = 0; unsigned int copylen; struct net *net = sock_net(sk); struct netns_ipvs *ipvs = net_ipvs(net); BUG_ON(!net); BUILD_BUG_ON(sizeof(arg) > 255); if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) return -EINVAL; copylen = get_arglen[CMDID(cmd)]; if (*len < (int) copylen) { IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen); return -EINVAL; } if (copy_from_user(arg, user, copylen) != 0) return -EFAULT; /* * Handle daemons first since it has its own locking */ if (cmd == IP_VS_SO_GET_DAEMON) { struct ip_vs_daemon_user d[2]; memset(&d, 0, sizeof(d)); mutex_lock(&ipvs->sync_mutex); if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, sizeof(d[0].mcast_ifn)); d[0].syncid = ipvs->mcfg.syncid; } if (ipvs->sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, sizeof(d[1].mcast_ifn)); d[1].syncid = ipvs->bcfg.syncid; } if (copy_to_user(user, &d, sizeof(d)) != 0) ret = -EFAULT; mutex_unlock(&ipvs->sync_mutex); return ret; } mutex_lock(&__ip_vs_mutex); switch (cmd) { case IP_VS_SO_GET_VERSION: { char buf[64]; sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); if (copy_to_user(user, buf, strlen(buf)+1) != 0) { ret = -EFAULT; goto out; } *len = strlen(buf)+1; } break; case IP_VS_SO_GET_INFO: { struct ip_vs_getinfo info; info.version = IP_VS_VERSION_CODE; info.size = ip_vs_conn_tab_size; info.num_services = ipvs->num_services; if (copy_to_user(user, &info, sizeof(info)) != 0) ret = -EFAULT; } break; case IP_VS_SO_GET_SERVICES: { struct ip_vs_get_services *get; int size; get = (struct ip_vs_get_services *)arg; size = struct_size(get, entrytable, get->num_services); if (*len != size) { pr_err("length: %u != %u\n", *len, size); ret = -EINVAL; goto out; } ret = __ip_vs_get_service_entries(ipvs, get, user); } break; case IP_VS_SO_GET_SERVICE: { struct ip_vs_service_entry *entry; struct ip_vs_service *svc; union nf_inet_addr addr; entry = (struct ip_vs_service_entry *)arg; addr.ip = entry->addr; rcu_read_lock(); if (entry->fwmark) svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark); else svc = __ip_vs_service_find(ipvs, AF_INET, entry->protocol, &addr, entry->port); rcu_read_unlock(); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) ret = -EFAULT; } else ret = -ESRCH; } break; case IP_VS_SO_GET_DESTS: { struct ip_vs_get_dests *get; int size; get = (struct ip_vs_get_dests *)arg; size = struct_size(get, entrytable, get->num_dests); if (*len != size) { pr_err("length: %u != %u\n", *len, size); ret = -EINVAL; goto out; } ret = __ip_vs_get_dest_entries(ipvs, get, user); } break; case IP_VS_SO_GET_TIMEOUT: { struct ip_vs_timeout_user t; __ip_vs_get_timeouts(ipvs, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; } break; default: ret = -EINVAL; } out: mutex_unlock(&__ip_vs_mutex); return ret; } static struct nf_sockopt_ops ip_vs_sockopts = { .pf = PF_INET, .set_optmin = IP_VS_BASE_CTL, .set_optmax = IP_VS_SO_SET_MAX+1, .set = do_ip_vs_set_ctl, .get_optmin = IP_VS_BASE_CTL, .get_optmax = IP_VS_SO_GET_MAX+1, .get = do_ip_vs_get_ctl, .owner = THIS_MODULE, }; /* * Generic Netlink interface */ /* IPVS genetlink family */ static struct genl_family ip_vs_genl_family; /* Policy used for first-level command attributes */ static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, .len = IP_VS_IFNAME_MAXLEN - 1 }, [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) }, [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, .len = sizeof(union nf_inet_addr) }, [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, .len = IP_VS_SCHEDNAME_MAXLEN - 1 }, [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, .len = IP_VS_PENAME_MAXLEN }, [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, .len = sizeof(struct ip_vs_flags) }, [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, .len = sizeof(union nf_inet_addr) }, [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, struct ip_vs_kstats *kstats) { struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); if (!nl_stats) return -EMSGSIZE; if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, IPVS_STATS_ATTR_PAD) || nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) goto nla_put_failure; nla_nest_end(skb, nl_stats); return 0; nla_put_failure: nla_nest_cancel(skb, nl_stats); return -EMSGSIZE; } static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, struct ip_vs_kstats *kstats) { struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); if (!nl_stats) return -EMSGSIZE; if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps, IPVS_STATS_ATTR_PAD)) goto nla_put_failure; nla_nest_end(skb, nl_stats); return 0; nla_put_failure: nla_nest_cancel(skb, nl_stats); return -EMSGSIZE; } static int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_service *svc) { struct ip_vs_scheduler *sched; struct ip_vs_pe *pe; struct nlattr *nl_service; struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; struct ip_vs_kstats kstats; char *sched_name; nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE); if (!nl_service) return -EMSGSIZE; if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) goto nla_put_failure; if (svc->fwmark) { if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) goto nla_put_failure; } else { if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) goto nla_put_failure; } sched = rcu_dereference_protected(svc->scheduler, 1); sched_name = sched ? sched->name : "none"; pe = rcu_dereference_protected(svc->pe, 1); if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) goto nla_put_failure; ip_vs_copy_stats(&kstats, &svc->stats); if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats)) goto nla_put_failure; if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats)) goto nla_put_failure; nla_nest_end(skb, nl_service); return 0; nla_put_failure: nla_nest_cancel(skb, nl_service); return -EMSGSIZE; } static int ip_vs_genl_dump_service(struct sk_buff *skb, struct ip_vs_service *svc, struct netlink_callback *cb) { void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_SERVICE); if (!hdr) return -EMSGSIZE; if (ip_vs_genl_fill_service(skb, svc) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ip_vs_genl_dump_services(struct sk_buff *skb, struct netlink_callback *cb) { int idx = 0, i; int start = cb->args[0]; struct ip_vs_service *svc; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { if (++idx <= start || (svc->ipvs != ipvs)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; goto nla_put_failure; } } } for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { if (++idx <= start || (svc->ipvs != ipvs)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; goto nla_put_failure; } } } nla_put_failure: mutex_unlock(&__ip_vs_mutex); cb->args[0] = idx; return skb->len; } static bool ip_vs_is_af_valid(int af) { if (af == AF_INET) return true; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6 && ipv6_mod_enabled()) return true; #endif return false; } static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *usvc, struct nlattr *nla, bool full_entry, struct ip_vs_service **ret_svc) { struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; struct ip_vs_service *svc; /* Parse mandatory identifying service fields first */ if (nla == NULL || nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL)) return -EINVAL; nla_af = attrs[IPVS_SVC_ATTR_AF]; nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; nla_port = attrs[IPVS_SVC_ATTR_PORT]; nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) return -EINVAL; memset(usvc, 0, sizeof(*usvc)); usvc->af = nla_get_u16(nla_af); if (!ip_vs_is_af_valid(usvc->af)) return -EAFNOSUPPORT; if (nla_fwmark) { usvc->protocol = IPPROTO_TCP; usvc->fwmark = nla_get_u32(nla_fwmark); } else { usvc->protocol = nla_get_u16(nla_protocol); nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); usvc->port = nla_get_be16(nla_port); usvc->fwmark = 0; } rcu_read_lock(); if (usvc->fwmark) svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark); else svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol, &usvc->addr, usvc->port); rcu_read_unlock(); *ret_svc = svc; /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, *nla_netmask; struct ip_vs_flags flags; nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) return -EINVAL; nla_memcpy(&flags, nla_flags, sizeof(flags)); /* prefill flags from service if it already exists */ if (svc) usvc->flags = svc->flags; /* set new flags from userland */ usvc->flags = (usvc->flags & ~flags.mask) | (flags.flags & flags.mask); usvc->sched_name = nla_data(nla_sched); usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; usvc->timeout = nla_get_u32(nla_timeout); usvc->netmask = nla_get_be32(nla_netmask); } return 0; } static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs, struct nlattr *nla) { struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; int ret; ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc); return ret ? ERR_PTR(ret) : svc; } static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) { struct nlattr *nl_dest; struct ip_vs_kstats kstats; nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST); if (!nl_dest) return -EMSGSIZE; if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, (atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK)) || nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)) || nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE, dest->tun_type) || nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, dest->tun_port) || nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, dest->tun_flags) || nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, atomic_read(&dest->activeconns)) || nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, atomic_read(&dest->inactconns)) || nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, atomic_read(&dest->persistconns)) || nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) goto nla_put_failure; ip_vs_copy_stats(&kstats, &dest->stats); if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats)) goto nla_put_failure; if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats)) goto nla_put_failure; nla_nest_end(skb, nl_dest); return 0; nla_put_failure: nla_nest_cancel(skb, nl_dest); return -EMSGSIZE; } static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, struct netlink_callback *cb) { void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DEST); if (!hdr) return -EMSGSIZE; if (ip_vs_genl_fill_dest(skb, dest) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct netlink_callback *cb) { int idx = 0; int start = cb->args[0]; struct ip_vs_service *svc; struct ip_vs_dest *dest; struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_mutex); /* Try to find the service for which to dump destinations */ if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack)) goto out_err; svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR_OR_NULL(svc)) goto out_err; /* Dump the destinations */ list_for_each_entry(dest, &svc->destinations, n_list) { if (++idx <= start) continue; if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { idx--; goto nla_put_failure; } } nla_put_failure: cb->args[0] = idx; out_err: mutex_unlock(&__ip_vs_mutex); return skb->len; } static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, struct nlattr *nla, bool full_entry) { struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; struct nlattr *nla_addr, *nla_port; struct nlattr *nla_addr_family; /* Parse mandatory identifying destination fields first */ if (nla == NULL || nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL)) return -EINVAL; nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; nla_port = attrs[IPVS_DEST_ATTR_PORT]; nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY]; if (!(nla_addr && nla_port)) return -EINVAL; memset(udest, 0, sizeof(*udest)); nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); udest->port = nla_get_be16(nla_port); if (nla_addr_family) udest->af = nla_get_u16(nla_addr_family); else udest->af = 0; /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, *nla_l_thresh, *nla_tun_type, *nla_tun_port, *nla_tun_flags; nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS]; if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) return -EINVAL; udest->conn_flags = nla_get_u32(nla_fwd) & IP_VS_CONN_F_FWD_MASK; udest->weight = nla_get_u32(nla_weight); udest->u_threshold = nla_get_u32(nla_u_thresh); udest->l_threshold = nla_get_u32(nla_l_thresh); if (nla_tun_type) udest->tun_type = nla_get_u8(nla_tun_type); if (nla_tun_port) udest->tun_port = nla_get_be16(nla_tun_port); if (nla_tun_flags) udest->tun_flags = nla_get_u16(nla_tun_flags); } return 0; } static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, struct ipvs_sync_daemon_cfg *c) { struct nlattr *nl_daemon; nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON); if (!nl_daemon) return -EMSGSIZE; if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) goto nla_put_failure; #ifdef CONFIG_IP_VS_IPV6 if (c->mcast_af == AF_INET6) { if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, &c->mcast_group.in6)) goto nla_put_failure; } else #endif if (c->mcast_af == AF_INET && nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, c->mcast_group.ip)) goto nla_put_failure; nla_nest_end(skb, nl_daemon); return 0; nla_put_failure: nla_nest_cancel(skb, nl_daemon); return -EMSGSIZE; } static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, struct ipvs_sync_daemon_cfg *c, struct netlink_callback *cb) { void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DAEMON); if (!hdr) return -EMSGSIZE; if (ip_vs_genl_fill_daemon(skb, state, c)) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ip_vs_genl_dump_daemons(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&ipvs->sync_mutex); if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, &ipvs->mcfg, cb) < 0) goto nla_put_failure; cb->args[0] = 1; } if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, &ipvs->bcfg, cb) < 0) goto nla_put_failure; cb->args[1] = 1; } nla_put_failure: mutex_unlock(&ipvs->sync_mutex); return skb->len; } static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { struct ipvs_sync_daemon_cfg c; struct nlattr *a; int ret; memset(&c, 0, sizeof(c)); if (!(attrs[IPVS_DAEMON_ATTR_STATE] && attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), sizeof(c.mcast_ifn)); c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN]; if (a) c.sync_maxlen = nla_get_u16(a); a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP]; if (a) { c.mcast_af = AF_INET; c.mcast_group.ip = nla_get_in_addr(a); if (!ipv4_is_multicast(c.mcast_group.ip)) return -EINVAL; } else { a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6]; if (a) { #ifdef CONFIG_IP_VS_IPV6 int addr_type; c.mcast_af = AF_INET6; c.mcast_group.in6 = nla_get_in6_addr(a); addr_type = ipv6_addr_type(&c.mcast_group.in6); if (!(addr_type & IPV6_ADDR_MULTICAST)) return -EINVAL; #else return -EAFNOSUPPORT; #endif } } a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT]; if (a) c.mcast_port = nla_get_u16(a); a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL]; if (a) c.mcast_ttl = nla_get_u8(a); /* The synchronization protocol is incompatible with mixed family * services */ if (ipvs->mixed_address_family_dests > 0) return -EINVAL; ret = start_sync_thread(ipvs, &c, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); return ret; } static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { int ret; if (!attrs[IPVS_DAEMON_ATTR_STATE]) return -EINVAL; ret = stop_sync_thread(ipvs, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); return ret; } static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs) { struct ip_vs_timeout_user t; __ip_vs_get_timeouts(ipvs, &t); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) t.tcp_fin_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); return ip_vs_set_timeout(ipvs, &t); } static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) { int ret = -EINVAL, cmd; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack)) goto out; if (cmd == IPVS_CMD_NEW_DAEMON) ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs); else ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs); } out: return ret; } static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) { bool need_full_svc = false, need_full_dest = false; struct ip_vs_service *svc = NULL; struct ip_vs_service_user_kern usvc; struct ip_vs_dest_user_kern udest; int ret = 0, cmd; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; mutex_lock(&__ip_vs_mutex); if (cmd == IPVS_CMD_FLUSH) { ret = ip_vs_flush(ipvs, false); goto out; } else if (cmd == IPVS_CMD_SET_CONFIG) { ret = ip_vs_genl_set_config(ipvs, info->attrs); goto out; } else if (cmd == IPVS_CMD_ZERO && !info->attrs[IPVS_CMD_ATTR_SERVICE]) { ret = ip_vs_zero_all(ipvs); goto out; } /* All following commands require a service argument, so check if we * received a valid one. We need a full service specification when * adding / editing a service. Only identifying members otherwise. */ if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) need_full_svc = true; ret = ip_vs_genl_parse_service(ipvs, &usvc, info->attrs[IPVS_CMD_ATTR_SERVICE], need_full_svc, &svc); if (ret) goto out; /* Unless we're adding a new service, the service must already exist */ if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { ret = -ESRCH; goto out; } /* Destination commands require a valid destination argument. For * adding / editing a destination, we need a full destination * specification. */ if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || cmd == IPVS_CMD_DEL_DEST) { if (cmd != IPVS_CMD_DEL_DEST) need_full_dest = true; ret = ip_vs_genl_parse_dest(&udest, info->attrs[IPVS_CMD_ATTR_DEST], need_full_dest); if (ret) goto out; /* Old protocols did not allow the user to specify address * family, so we set it to zero instead. We also didn't * allow heterogeneous pools in the old code, so it's safe * to assume that this will have the same address family as * the service. */ if (udest.af == 0) udest.af = svc->af; if (!ip_vs_is_af_valid(udest.af)) { ret = -EAFNOSUPPORT; goto out; } if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { /* The synchronization protocol is incompatible * with mixed family services */ if (ipvs->sync_state) { ret = -EINVAL; goto out; } /* Which connection types do we support? */ switch (udest.conn_flags) { case IP_VS_CONN_F_TUNNEL: /* We are able to forward this */ break; default: ret = -EINVAL; goto out; } } } switch (cmd) { case IPVS_CMD_NEW_SERVICE: if (svc == NULL) ret = ip_vs_add_service(ipvs, &usvc, &svc); else ret = -EEXIST; break; case IPVS_CMD_SET_SERVICE: ret = ip_vs_edit_service(svc, &usvc); break; case IPVS_CMD_DEL_SERVICE: ret = ip_vs_del_service(svc); /* do not use svc, it can be freed */ break; case IPVS_CMD_NEW_DEST: ret = ip_vs_add_dest(svc, &udest); break; case IPVS_CMD_SET_DEST: ret = ip_vs_edit_dest(svc, &udest); break; case IPVS_CMD_DEL_DEST: ret = ip_vs_del_dest(svc, &udest); break; case IPVS_CMD_ZERO: ret = ip_vs_zero_service(svc); break; default: ret = -EINVAL; } out: mutex_unlock(&__ip_vs_mutex); return ret; } static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; void *reply; int ret, cmd, reply_cmd; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_GET_SERVICE) reply_cmd = IPVS_CMD_NEW_SERVICE; else if (cmd == IPVS_CMD_GET_INFO) reply_cmd = IPVS_CMD_SET_INFO; else if (cmd == IPVS_CMD_GET_CONFIG) reply_cmd = IPVS_CMD_SET_CONFIG; else { pr_err("unknown Generic Netlink command\n"); return -EINVAL; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; mutex_lock(&__ip_vs_mutex); reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); if (reply == NULL) goto nla_put_failure; switch (cmd) { case IPVS_CMD_GET_SERVICE: { struct ip_vs_service *svc; svc = ip_vs_genl_find_service(ipvs, info->attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc)) { ret = PTR_ERR(svc); goto out_err; } else if (svc) { ret = ip_vs_genl_fill_service(msg, svc); if (ret) goto nla_put_failure; } else { ret = -ESRCH; goto out_err; } break; } case IPVS_CMD_GET_CONFIG: { struct ip_vs_timeout_user t; __ip_vs_get_timeouts(ipvs, &t); #ifdef CONFIG_IP_VS_PROTO_TCP if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout) || nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, t.tcp_fin_timeout)) goto nla_put_failure; #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout)) goto nla_put_failure; #endif break; } case IPVS_CMD_GET_INFO: if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE) || nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, ip_vs_conn_tab_size)) goto nla_put_failure; break; } genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); goto out; nla_put_failure: pr_err("not enough space in Netlink message\n"); ret = -EMSGSIZE; out_err: nlmsg_free(msg); out: mutex_unlock(&__ip_vs_mutex); return ret; } static const struct genl_small_ops ip_vs_genl_ops[] = { { .cmd = IPVS_CMD_NEW_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_SET_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_DEL_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, .dumpit = ip_vs_genl_dump_services, }, { .cmd = IPVS_CMD_NEW_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_SET_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_DEL_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .dumpit = ip_vs_genl_dump_dests, }, { .cmd = IPVS_CMD_NEW_DAEMON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_DEL_DAEMON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_GET_DAEMON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .dumpit = ip_vs_genl_dump_daemons, }, { .cmd = IPVS_CMD_SET_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, }, { .cmd = IPVS_CMD_GET_INFO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, }, { .cmd = IPVS_CMD_ZERO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_FLUSH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, }; static struct genl_family ip_vs_genl_family __ro_after_init = { .hdrsize = 0, .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, .maxattr = IPVS_CMD_ATTR_MAX, .policy = ip_vs_cmd_policy, .netnsok = true, /* Make ipvsadm to work on netns */ .module = THIS_MODULE, .small_ops = ip_vs_genl_ops, .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops), .resv_start_op = IPVS_CMD_FLUSH + 1, }; static int __init ip_vs_genl_register(void) { return genl_register_family(&ip_vs_genl_family); } static void ip_vs_genl_unregister(void) { genl_unregister_family(&ip_vs_genl_family); } /* End of Generic Netlink interface definitions */ /* * per netns intit/exit func. */ #ifdef CONFIG_SYSCTL static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; struct ctl_table *tbl; int idx, ret; size_t ctl_table_size = ARRAY_SIZE(vs_vars); atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); spin_lock_init(&ipvs->droppacket_lock); spin_lock_init(&ipvs->securetcp_lock); INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, expire_nodest_conn_handler); ipvs->est_stopped = 0; if (!net_eq(net, &init_net)) { tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { tbl[0].procname = NULL; ctl_table_size = 0; } } else tbl = vs_vars; /* Initialize sysctl defaults */ for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) { if (tbl[idx].proc_handler == proc_do_defense_mode) tbl[idx].extra2 = ipvs; } idx = 0; ipvs->sysctl_amemthresh = 1024; tbl[idx++].data = &ipvs->sysctl_amemthresh; ipvs->sysctl_am_droprate = 10; tbl[idx++].data = &ipvs->sysctl_am_droprate; tbl[idx++].data = &ipvs->sysctl_drop_entry; tbl[idx++].data = &ipvs->sysctl_drop_packet; #ifdef CONFIG_IP_VS_NFCT tbl[idx++].data = &ipvs->sysctl_conntrack; #endif tbl[idx++].data = &ipvs->sysctl_secure_tcp; ipvs->sysctl_snat_reroute = 1; tbl[idx++].data = &ipvs->sysctl_snat_reroute; ipvs->sysctl_sync_ver = 1; tbl[idx++].data = &ipvs->sysctl_sync_ver; ipvs->sysctl_sync_ports = 1; tbl[idx++].data = &ipvs->sysctl_sync_ports; tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; ipvs->sysctl_sync_sock_size = 0; tbl[idx++].data = &ipvs->sysctl_sync_sock_size; tbl[idx++].data = &ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; tbl[idx++].data = &ipvs->sysctl_sloppy_sctp; tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; tbl[idx].data = &ipvs->sysctl_sync_threshold; tbl[idx].extra2 = ipvs; tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); tbl[idx++].data = &ipvs->sysctl_sync_retries; tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; ipvs->sysctl_pmtu_disc = 1; tbl[idx++].data = &ipvs->sysctl_pmtu_disc; tbl[idx++].data = &ipvs->sysctl_backup_only; ipvs->sysctl_conn_reuse_mode = 1; tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; ipvs->sysctl_run_estimation = 1; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_run_estimation; ipvs->est_cpulist_valid = 0; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_cpulist; ipvs->sysctl_est_nice = IPVS_EST_NICE; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_nice; #ifdef CONFIG_IP_VS_DEBUG /* Global sysctls must be ro in non-init netns */ if (!net_eq(net, &init_net)) tbl[idx++].mode = 0444; #endif ret = -ENOMEM; ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl, ctl_table_size); if (!ipvs->sysctl_hdr) goto err; ipvs->sysctl_tbl = tbl; ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); if (ret < 0) goto err; /* Schedule defense work */ queue_delayed_work(system_long_wq, &ipvs->defense_work, DEFENSE_TIMER_PERIOD); return 0; err: unregister_net_sysctl_table(ipvs->sysctl_hdr); if (!net_eq(net, &init_net)) kfree(tbl); return ret; } static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work); cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); if (ipvs->est_cpulist_valid) free_cpumask_var(ipvs->sysctl_est_cpulist); if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); } #else static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; } static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { } #endif static struct notifier_block ip_vs_dst_notifier = { .notifier_call = ip_vs_dst_event, #ifdef CONFIG_IP_VS_IPV6 .priority = ADDRCONF_NOTIFY_PRIORITY + 5, #endif }; int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { int ret = -ENOMEM; int idx; /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) INIT_HLIST_HEAD(&ipvs->rs_table[idx]); INIT_LIST_HEAD(&ipvs->dest_trash); spin_lock_init(&ipvs->dest_trash_lock); timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); atomic_set(&ipvs->conn_out_counter, 0); INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); /* procfs stats */ ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); if (!ipvs->tot_stats) goto out; if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) goto err_tot_stats; #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter))) goto err_vs; if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net, ip_vs_stats_show, NULL)) goto err_stats; if (!proc_create_net_single("ip_vs_stats_percpu", 0, ipvs->net->proc_net, ip_vs_stats_percpu_show, NULL)) goto err_percpu; #endif ret = ip_vs_control_net_init_sysctl(ipvs); if (ret < 0) goto err; return 0; err: #ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); err_percpu: remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); err_stats: remove_proc_entry("ip_vs", ipvs->net->proc_net); err_vs: #endif ip_vs_stats_release(&ipvs->tot_stats->s); err_tot_stats: kfree(ipvs->tot_stats); out: return ret; } void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { ip_vs_trash_cleanup(ipvs); ip_vs_control_net_cleanup_sysctl(ipvs); cancel_delayed_work_sync(&ipvs->est_reload_work); #ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); #endif call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free); } int __init ip_vs_register_nl_ioctl(void) { int ret; ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); goto err_sock; } ret = ip_vs_genl_register(); if (ret) { pr_err("cannot register Generic Netlink interface.\n"); goto err_genl; } return 0; err_genl: nf_unregister_sockopt(&ip_vs_sockopts); err_sock: return ret; } void ip_vs_unregister_nl_ioctl(void) { ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); } int __init ip_vs_control_init(void) { int idx; int ret; /* Initialize svc_table, ip_vs_svc_fwm_table */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); } smp_wmb(); /* Do we really need it now ? */ ret = register_netdevice_notifier(&ip_vs_dst_notifier); if (ret < 0) return ret; return 0; } void ip_vs_control_cleanup(void) { unregister_netdevice_notifier(&ip_vs_dst_notifier); /* relying on common rcu_barrier() in ip_vs_cleanup() */ } |
7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_IP6_TUNNEL_H #define _NET_IP6_TUNNEL_H #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/if_tunnel.h> #include <linux/ip6_tunnel.h> #include <net/ip_tunnels.h> #include <net/dst_cache.h> #define IP6TUNNEL_ERR_TIMEO (30*HZ) /* capable of sending packets */ #define IP6_TNL_F_CAP_XMIT 0x10000 /* capable of receiving packets */ #define IP6_TNL_F_CAP_RCV 0x20000 /* determine capability on a per-packet basis */ #define IP6_TNL_F_CAP_PER_PACKET 0x40000 struct __ip6_tnl_parm { char name[IFNAMSIZ]; /* name of tunnel device */ int link; /* ifindex of underlying L2 interface */ __u8 proto; /* tunnel protocol */ __u8 encap_limit; /* encapsulation limit for tunnel */ __u8 hop_limit; /* hop limit for tunnel */ bool collect_md; __be32 flowinfo; /* traffic class and flowlabel for tunnel */ __u32 flags; /* tunnel flags */ struct in6_addr laddr; /* local tunnel end-point address */ struct in6_addr raddr; /* remote tunnel end-point address */ __be16 i_flags; __be16 o_flags; __be32 i_key; __be32 o_key; __u32 fwmark; __u32 index; /* ERSPAN type II index */ __u8 erspan_ver; /* ERSPAN version */ __u8 dir; /* direction */ __u16 hwid; /* hwid */ }; /* IPv6 tunnel */ struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ struct net_device *dev; /* virtual device associated with tunnel */ netdevice_tracker dev_tracker; struct net *net; /* netns for packet i/o */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct flowi fl; /* flowi template for xmit */ struct dst_cache dst_cache; /* cached dst */ struct gro_cells gro_cells; int err_count; unsigned long err_time; /* These fields used only by GRE */ __u32 i_seqno; /* The last seen seqno */ atomic_t o_seqno; /* The last output seqno */ int hlen; /* tun_hlen + encap_hlen */ int tun_hlen; /* Precalculated header length */ int encap_hlen; /* Encap header length (FOU,GUE) */ struct ip_tunnel_encap encap; int mlink; }; struct ip6_tnl_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi6 *fl6); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); }; #ifdef CONFIG_INET extern const struct ip6_tnl_encap_ops __rcu * ip6tun_encaps[MAX_IPTUN_ENCAP_OPS]; int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap); static inline int ip6_encap_hlen(struct ip_tunnel_encap *e) { const struct ip6_tnl_encap_ops *ops; int hlen = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(ip6tun_encaps[e->type]); if (likely(ops && ops->encap_hlen)) hlen = ops->encap_hlen(e); rcu_read_unlock(); return hlen; } static inline int ip6_tnl_encap(struct sk_buff *skb, struct ip6_tnl *t, u8 *protocol, struct flowi6 *fl6) { const struct ip6_tnl_encap_ops *ops; int ret = -EINVAL; if (t->encap.type == TUNNEL_ENCAP_NONE) return 0; if (t->encap.type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(ip6tun_encaps[t->encap.type]); if (likely(ops && ops->build_header)) ret = ops->build_header(skb, &t->encap, protocol, fl6); rcu_read_unlock(); return ret; } /* Tunnel encapsulation limit destination sub-option */ struct ipv6_tlv_tnl_enc_lim { __u8 type; /* type-code for option */ __u8 length; /* option length */ __u8 encap_limit; /* tunnel encapsulation limit */ } __packed; int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto); __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw); __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); struct net *ip6_tnl_get_link_net(const struct net_device *dev); int ip6_tnl_get_iflink(const struct net_device *dev); int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu); static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, struct net_device *dev) { int pkt_len, err; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); pkt_len = skb->len - skb_inner_network_offset(skb); err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); if (dev) { if (unlikely(net_xmit_eval(err))) pkt_len = -1; iptunnel_xmit_stats(dev, pkt_len); } } #endif #endif |
1110 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 | // SPDX-License-Identifier: GPL-2.0-only /* * Dynamic DMA mapping support. * * This implementation is a fallback for platforms that do not support * I/O TLBs (aka DMA address translation hardware). * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com> * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com> * Copyright (C) 2000, 2003 Hewlett-Packard Co * David Mosberger-Tang <davidm@hpl.hp.com> * * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid * unnecessary i-cache flushing. * 04/07/.. ak Better overflow handling. Assorted fixes. * 05/09/10 linville Add support for syncing ranges, support syncing for * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. * 08/12/11 beckyb Add highmem support */ #define pr_fmt(fmt) "software IO TLB: " fmt #include <linux/cache.h> #include <linux/cc_platform.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/dma-direct.h> #include <linux/dma-map-ops.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/io.h> #include <linux/iommu-helper.h> #include <linux/init.h> #include <linux/memblock.h> #include <linux/mm.h> #include <linux/pfn.h> #include <linux/rculist.h> #include <linux/scatterlist.h> #include <linux/set_memory.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/swiotlb.h> #include <linux/types.h> #ifdef CONFIG_DMA_RESTRICTED_POOL #include <linux/of.h> #include <linux/of_fdt.h> #include <linux/of_reserved_mem.h> #include <linux/slab.h> #endif #define CREATE_TRACE_POINTS #include <trace/events/swiotlb.h> #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) /* * Minimum IO TLB size to bother booting with. Systems with mainly * 64bit capable cards will only lightly use the swiotlb. If we can't * allocate a contiguous 1MB, we're probably in trouble anyway. */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) #define INVALID_PHYS_ADDR (~(phys_addr_t)0) /** * struct io_tlb_slot - IO TLB slot descriptor * @orig_addr: The original address corresponding to a mapped entry. * @alloc_size: Size of the allocated buffer. * @list: The free list describing the number of free entries available * from each index. */ struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; unsigned int list; }; static bool swiotlb_force_bounce; static bool swiotlb_force_disable; #ifdef CONFIG_SWIOTLB_DYNAMIC static void swiotlb_dyn_alloc(struct work_struct *work); static struct io_tlb_mem io_tlb_default_mem = { .lock = __SPIN_LOCK_UNLOCKED(io_tlb_default_mem.lock), .pools = LIST_HEAD_INIT(io_tlb_default_mem.pools), .dyn_alloc = __WORK_INITIALIZER(io_tlb_default_mem.dyn_alloc, swiotlb_dyn_alloc), }; #else /* !CONFIG_SWIOTLB_DYNAMIC */ static struct io_tlb_mem io_tlb_default_mem; #endif /* CONFIG_SWIOTLB_DYNAMIC */ static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; static unsigned long default_nareas; /** * struct io_tlb_area - IO TLB memory area descriptor * * This is a single area with a single lock. * * @used: The number of used IO TLB block. * @index: The slot index to start searching in this area for next round. * @lock: The lock to protect the above data structures in the map and * unmap calls. */ struct io_tlb_area { unsigned long used; unsigned int index; spinlock_t lock; }; /* * Round up number of slabs to the next power of 2. The last area is going * be smaller than the rest if default_nslabs is not power of two. * The number of slot in an area should be a multiple of IO_TLB_SEGSIZE, * otherwise a segment may span two or more areas. It conflicts with free * contiguous slots tracking: free slots are treated contiguous no matter * whether they cross an area boundary. * * Return true if default_nslabs is rounded up. */ static bool round_up_default_nslabs(void) { if (!default_nareas) return false; if (default_nslabs < IO_TLB_SEGSIZE * default_nareas) default_nslabs = IO_TLB_SEGSIZE * default_nareas; else if (is_power_of_2(default_nslabs)) return false; default_nslabs = roundup_pow_of_two(default_nslabs); return true; } /** * swiotlb_adjust_nareas() - adjust the number of areas and slots * @nareas: Desired number of areas. Zero is treated as 1. * * Adjust the default number of areas in a memory pool. * The default size of the memory pool may also change to meet minimum area * size requirements. */ static void swiotlb_adjust_nareas(unsigned int nareas) { if (!nareas) nareas = 1; else if (!is_power_of_2(nareas)) nareas = roundup_pow_of_two(nareas); default_nareas = nareas; pr_info("area num %d.\n", nareas); if (round_up_default_nslabs()) pr_info("SWIOTLB bounce buffer size roundup to %luMB", (default_nslabs << IO_TLB_SHIFT) >> 20); } /** * limit_nareas() - get the maximum number of areas for a given memory pool size * @nareas: Desired number of areas. * @nslots: Total number of slots in the memory pool. * * Limit the number of areas to the maximum possible number of areas in * a memory pool of the given size. * * Return: Maximum possible number of areas. */ static unsigned int limit_nareas(unsigned int nareas, unsigned long nslots) { if (nslots < nareas * IO_TLB_SEGSIZE) return nslots / IO_TLB_SEGSIZE; return nareas; } static int __init setup_io_tlb_npages(char *str) { if (isdigit(*str)) { /* avoid tail segment of size < IO_TLB_SEGSIZE */ default_nslabs = ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE); } if (*str == ',') ++str; if (isdigit(*str)) swiotlb_adjust_nareas(simple_strtoul(str, &str, 0)); if (*str == ',') ++str; if (!strcmp(str, "force")) swiotlb_force_bounce = true; else if (!strcmp(str, "noforce")) swiotlb_force_disable = true; return 0; } early_param("swiotlb", setup_io_tlb_npages); unsigned long swiotlb_size_or_default(void) { return default_nslabs << IO_TLB_SHIFT; } void __init swiotlb_adjust_size(unsigned long size) { /* * If swiotlb parameter has not been specified, give a chance to * architectures such as those supporting memory encryption to * adjust/expand SWIOTLB size for their use. */ if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT) return; size = ALIGN(size, IO_TLB_SIZE); default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); if (round_up_default_nslabs()) size = default_nslabs << IO_TLB_SHIFT; pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); } void swiotlb_print_info(void) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; if (!mem->nslabs) { pr_warn("No low mem\n"); return; } pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end, (mem->nslabs << IO_TLB_SHIFT) >> 20); } static inline unsigned long io_tlb_offset(unsigned long val) { return val & (IO_TLB_SEGSIZE - 1); } static inline unsigned long nr_slots(u64 val) { return DIV_ROUND_UP(val, IO_TLB_SIZE); } /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to * call SWIOTLB when the operations are possible. It needs to be called * before the SWIOTLB memory is used. */ void __init swiotlb_update_mem_attributes(void) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long bytes; if (!mem->nslabs || mem->late_alloc) return; bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT); } static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start, unsigned long nslabs, bool late_alloc, unsigned int nareas) { void *vaddr = phys_to_virt(start); unsigned long bytes = nslabs << IO_TLB_SHIFT, i; mem->nslabs = nslabs; mem->start = start; mem->end = mem->start + bytes; mem->late_alloc = late_alloc; mem->nareas = nareas; mem->area_nslabs = nslabs / mem->nareas; for (i = 0; i < mem->nareas; i++) { spin_lock_init(&mem->areas[i].lock); mem->areas[i].index = 0; mem->areas[i].used = 0; } for (i = 0; i < mem->nslabs; i++) { mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; } memset(vaddr, 0, bytes); mem->vaddr = vaddr; return; } /** * add_mem_pool() - add a memory pool to the allocator * @mem: Software IO TLB allocator. * @pool: Memory pool to be added. */ static void add_mem_pool(struct io_tlb_mem *mem, struct io_tlb_pool *pool) { #ifdef CONFIG_SWIOTLB_DYNAMIC spin_lock(&mem->lock); list_add_rcu(&pool->node, &mem->pools); mem->nslabs += pool->nslabs; spin_unlock(&mem->lock); #else mem->nslabs = pool->nslabs; #endif } static void __init *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) { size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT); void *tlb; /* * By default allocate the bounce buffer memory from low memory, but * allow to pick a location everywhere for hypervisors with guest * memory encryption. */ if (flags & SWIOTLB_ANY) tlb = memblock_alloc(bytes, PAGE_SIZE); else tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) { pr_warn("%s: Failed to allocate %zu bytes tlb structure\n", __func__, bytes); return NULL; } if (remap && remap(tlb, nslabs) < 0) { memblock_free(tlb, PAGE_ALIGN(bytes)); pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes); return NULL; } return tlb; } /* * Statically reserve bounce buffer space and initialize bounce buffer data * structures for the software IO TLB used to implement the DMA API. */ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long nslabs; unsigned int nareas; size_t alloc_size; void *tlb; if (!addressing_limit && !swiotlb_force_bounce) return; if (swiotlb_force_disable) return; io_tlb_default_mem.force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE); #ifdef CONFIG_SWIOTLB_DYNAMIC if (!remap) io_tlb_default_mem.can_grow = true; if (flags & SWIOTLB_ANY) io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); else io_tlb_default_mem.phys_limit = ARCH_LOW_ADDRESS_LIMIT; #endif if (!default_nareas) swiotlb_adjust_nareas(num_possible_cpus()); nslabs = default_nslabs; nareas = limit_nareas(default_nareas, nslabs); while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) { if (nslabs <= IO_TLB_MIN_SLABS) return; nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); nareas = limit_nareas(nareas, nslabs); } if (default_nslabs != nslabs) { pr_info("SWIOTLB bounce buffer size adjusted %lu -> %lu slabs", default_nslabs, nslabs); default_nslabs = nslabs; } alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); if (!mem->slots) { pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); return; } mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area), nareas), SMP_CACHE_BYTES); if (!mem->areas) { pr_warn("%s: Failed to allocate mem->areas.\n", __func__); return; } swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, nareas); add_mem_pool(&io_tlb_default_mem, mem); if (flags & SWIOTLB_VERBOSE) swiotlb_print_info(); } void __init swiotlb_init(bool addressing_limit, unsigned int flags) { swiotlb_init_remap(addressing_limit, flags, NULL); } /* * Systems with larger DMA zones (those that don't support ISA) can * initialize the swiotlb later using the slab allocator if needed. * This should be just like above, but with some error catching. */ int swiotlb_init_late(size_t size, gfp_t gfp_mask, int (*remap)(void *tlb, unsigned long nslabs)) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); unsigned int nareas; unsigned char *vstart = NULL; unsigned int order, area_order; bool retried = false; int rc = 0; if (io_tlb_default_mem.nslabs) return 0; if (swiotlb_force_disable) return 0; io_tlb_default_mem.force_bounce = swiotlb_force_bounce; #ifdef CONFIG_SWIOTLB_DYNAMIC if (!remap) io_tlb_default_mem.can_grow = true; if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); else io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); #endif if (!default_nareas) swiotlb_adjust_nareas(num_possible_cpus()); retry: order = get_order(nslabs << IO_TLB_SHIFT); nslabs = SLABS_PER_PAGE << order; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, order); if (vstart) break; order--; nslabs = SLABS_PER_PAGE << order; retried = true; } if (!vstart) return -ENOMEM; if (remap) rc = remap(vstart, nslabs); if (rc) { free_pages((unsigned long)vstart, order); nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); if (nslabs < IO_TLB_MIN_SLABS) return rc; retried = true; goto retry; } if (retried) { pr_warn("only able to allocate %ld MB\n", (PAGE_SIZE << order) >> 20); } nareas = limit_nareas(default_nareas, nslabs); area_order = get_order(array_size(sizeof(*mem->areas), nareas)); mem->areas = (struct io_tlb_area *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order); if (!mem->areas) goto error_area; mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(array_size(sizeof(*mem->slots), nslabs))); if (!mem->slots) goto error_slots; set_memory_decrypted((unsigned long)vstart, (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT); swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true, nareas); add_mem_pool(&io_tlb_default_mem, mem); swiotlb_print_info(); return 0; error_slots: free_pages((unsigned long)mem->areas, area_order); error_area: free_pages((unsigned long)vstart, order); return -ENOMEM; } void __init swiotlb_exit(void) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long tbl_vaddr; size_t tbl_size, slots_size; unsigned int area_order; if (swiotlb_force_bounce) return; if (!mem->nslabs) return; pr_info("tearing down default memory pool\n"); tbl_vaddr = (unsigned long)phys_to_virt(mem->start); tbl_size = PAGE_ALIGN(mem->end - mem->start); slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs)); set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT); if (mem->late_alloc) { area_order = get_order(array_size(sizeof(*mem->areas), mem->nareas)); free_pages((unsigned long)mem->areas, area_order); free_pages(tbl_vaddr, get_order(tbl_size)); free_pages((unsigned long)mem->slots, get_order(slots_size)); } else { memblock_free_late(__pa(mem->areas), array_size(sizeof(*mem->areas), mem->nareas)); memblock_free_late(mem->start, tbl_size); memblock_free_late(__pa(mem->slots), slots_size); } memset(mem, 0, sizeof(*mem)); } #ifdef CONFIG_SWIOTLB_DYNAMIC /** * alloc_dma_pages() - allocate pages to be used for DMA * @gfp: GFP flags for the allocation. * @bytes: Size of the buffer. * * Allocate pages from the buddy allocator. If successful, make the allocated * pages decrypted that they can be used for DMA. * * Return: Decrypted pages, or %NULL on failure. */ static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes) { unsigned int order = get_order(bytes); struct page *page; void *vaddr; page = alloc_pages(gfp, order); if (!page) return NULL; vaddr = page_address(page); if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes))) goto error; return page; error: __free_pages(page, order); return NULL; } /** * swiotlb_alloc_tlb() - allocate a dynamic IO TLB buffer * @dev: Device for which a memory pool is allocated. * @bytes: Size of the buffer. * @phys_limit: Maximum allowed physical address of the buffer. * @gfp: GFP flags for the allocation. * * Return: Allocated pages, or %NULL on allocation failure. */ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes, u64 phys_limit, gfp_t gfp) { struct page *page; /* * Allocate from the atomic pools if memory is encrypted and * the allocation is atomic, because decrypting may block. */ if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) { void *vaddr; if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) return NULL; return dma_alloc_from_pool(dev, bytes, &vaddr, gfp, dma_coherent_ok); } gfp &= ~GFP_ZONEMASK; if (phys_limit <= DMA_BIT_MASK(zone_dma_bits)) gfp |= __GFP_DMA; else if (phys_limit <= DMA_BIT_MASK(32)) gfp |= __GFP_DMA32; while ((page = alloc_dma_pages(gfp, bytes)) && page_to_phys(page) + bytes - 1 > phys_limit) { /* allocated, but too high */ __free_pages(page, get_order(bytes)); if (IS_ENABLED(CONFIG_ZONE_DMA32) && phys_limit < DMA_BIT_MASK(64) && !(gfp & (__GFP_DMA32 | __GFP_DMA))) gfp |= __GFP_DMA32; else if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & __GFP_DMA)) gfp = (gfp & ~__GFP_DMA32) | __GFP_DMA; else return NULL; } return page; } /** * swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer * @vaddr: Virtual address of the buffer. * @bytes: Size of the buffer. */ static void swiotlb_free_tlb(void *vaddr, size_t bytes) { if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(NULL, vaddr, bytes)) return; /* Intentional leak if pages cannot be encrypted again. */ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes))) __free_pages(virt_to_page(vaddr), get_order(bytes)); } /** * swiotlb_alloc_pool() - allocate a new IO TLB memory pool * @dev: Device for which a memory pool is allocated. * @minslabs: Minimum number of slabs. * @nslabs: Desired (maximum) number of slabs. * @nareas: Number of areas. * @phys_limit: Maximum DMA buffer physical address. * @gfp: GFP flags for the allocations. * * Allocate and initialize a new IO TLB memory pool. The actual number of * slabs may be reduced if allocation of @nslabs fails. If even * @minslabs cannot be allocated, this function fails. * * Return: New memory pool, or %NULL on allocation failure. */ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev, unsigned long minslabs, unsigned long nslabs, unsigned int nareas, u64 phys_limit, gfp_t gfp) { struct io_tlb_pool *pool; unsigned int slot_order; struct page *tlb; size_t pool_size; size_t tlb_size; pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas); pool = kzalloc(pool_size, gfp); if (!pool) goto error; pool->areas = (void *)pool + sizeof(*pool); tlb_size = nslabs << IO_TLB_SHIFT; while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) { if (nslabs <= minslabs) goto error_tlb; nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); nareas = limit_nareas(nareas, nslabs); tlb_size = nslabs << IO_TLB_SHIFT; } slot_order = get_order(array_size(sizeof(*pool->slots), nslabs)); pool->slots = (struct io_tlb_slot *) __get_free_pages(gfp, slot_order); if (!pool->slots) goto error_slots; swiotlb_init_io_tlb_pool(pool, page_to_phys(tlb), nslabs, true, nareas); return pool; error_slots: swiotlb_free_tlb(page_address(tlb), tlb_size); error_tlb: kfree(pool); error: return NULL; } /** * swiotlb_dyn_alloc() - dynamic memory pool allocation worker * @work: Pointer to dyn_alloc in struct io_tlb_mem. */ static void swiotlb_dyn_alloc(struct work_struct *work) { struct io_tlb_mem *mem = container_of(work, struct io_tlb_mem, dyn_alloc); struct io_tlb_pool *pool; pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs, default_nareas, mem->phys_limit, GFP_KERNEL); if (!pool) { pr_warn_ratelimited("Failed to allocate new pool"); return; } add_mem_pool(mem, pool); } /** * swiotlb_dyn_free() - RCU callback to free a memory pool * @rcu: RCU head in the corresponding struct io_tlb_pool. */ static void swiotlb_dyn_free(struct rcu_head *rcu) { struct io_tlb_pool *pool = container_of(rcu, struct io_tlb_pool, rcu); size_t slots_size = array_size(sizeof(*pool->slots), pool->nslabs); size_t tlb_size = pool->end - pool->start; free_pages((unsigned long)pool->slots, get_order(slots_size)); swiotlb_free_tlb(pool->vaddr, tlb_size); kfree(pool); } /** * swiotlb_find_pool() - find the IO TLB pool for a physical address * @dev: Device which has mapped the DMA buffer. * @paddr: Physical address within the DMA buffer. * * Find the IO TLB memory pool descriptor which contains the given physical * address, if any. * * Return: Memory pool which contains @paddr, or %NULL if none. */ struct io_tlb_pool *swiotlb_find_pool(struct device *dev, phys_addr_t paddr) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; rcu_read_lock(); list_for_each_entry_rcu(pool, &mem->pools, node) { if (paddr >= pool->start && paddr < pool->end) goto out; } list_for_each_entry_rcu(pool, &dev->dma_io_tlb_pools, node) { if (paddr >= pool->start && paddr < pool->end) goto out; } pool = NULL; out: rcu_read_unlock(); return pool; } /** * swiotlb_del_pool() - remove an IO TLB pool from a device * @dev: Owning device. * @pool: Memory pool to be removed. */ static void swiotlb_del_pool(struct device *dev, struct io_tlb_pool *pool) { unsigned long flags; spin_lock_irqsave(&dev->dma_io_tlb_lock, flags); list_del_rcu(&pool->node); spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags); call_rcu(&pool->rcu, swiotlb_dyn_free); } #endif /* CONFIG_SWIOTLB_DYNAMIC */ /** * swiotlb_dev_init() - initialize swiotlb fields in &struct device * @dev: Device to be initialized. */ void swiotlb_dev_init(struct device *dev) { dev->dma_io_tlb_mem = &io_tlb_default_mem; #ifdef CONFIG_SWIOTLB_DYNAMIC INIT_LIST_HEAD(&dev->dma_io_tlb_pools); spin_lock_init(&dev->dma_io_tlb_lock); dev->dma_uses_io_tlb = false; #endif } /* * Return the offset into a iotlb slot required to keep the device happy. */ static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) { return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); } /* * Bounce: copy the swiotlb buffer from or back to the original dma location */ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr); int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = mem->slots[index].orig_addr; size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start; unsigned int tlb_offset, orig_addr_offset; if (orig_addr == INVALID_PHYS_ADDR) return; tlb_offset = tlb_addr & (IO_TLB_SIZE - 1); orig_addr_offset = swiotlb_align_offset(dev, orig_addr); if (tlb_offset < orig_addr_offset) { dev_WARN_ONCE(dev, 1, "Access before mapping start detected. orig offset %u, requested offset %u.\n", orig_addr_offset, tlb_offset); return; } tlb_offset -= orig_addr_offset; if (tlb_offset > alloc_size) { dev_WARN_ONCE(dev, 1, "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n", alloc_size, size, tlb_offset); return; } orig_addr += tlb_offset; alloc_size -= tlb_offset; if (size > alloc_size) { dev_WARN_ONCE(dev, 1, "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n", alloc_size, size); size = alloc_size; } if (PageHighMem(pfn_to_page(pfn))) { unsigned int offset = orig_addr & ~PAGE_MASK; struct page *page; unsigned int sz = 0; unsigned long flags; while (size) { sz = min_t(size_t, PAGE_SIZE - offset, size); local_irq_save(flags); page = pfn_to_page(pfn); if (dir == DMA_TO_DEVICE) memcpy_from_page(vaddr, page, offset, sz); else memcpy_to_page(page, offset, vaddr, sz); local_irq_restore(flags); size -= sz; pfn++; vaddr += sz; offset = 0; } } else if (dir == DMA_TO_DEVICE) { memcpy(vaddr, phys_to_virt(orig_addr), size); } else { memcpy(phys_to_virt(orig_addr), vaddr, size); } } static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx) { return start + (idx << IO_TLB_SHIFT); } /* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. */ static inline unsigned long get_max_slots(unsigned long boundary_mask) { return (boundary_mask >> IO_TLB_SHIFT) + 1; } static unsigned int wrap_area_index(struct io_tlb_pool *mem, unsigned int index) { if (index >= mem->area_nslabs) return 0; return index; } /* * Track the total used slots with a global atomic value in order to have * correct information to determine the high water mark. The mem_used() * function gives imprecise results because there's no locking across * multiple areas. */ #ifdef CONFIG_DEBUG_FS static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots) { unsigned long old_hiwater, new_used; new_used = atomic_long_add_return(nslots, &mem->total_used); old_hiwater = atomic_long_read(&mem->used_hiwater); do { if (new_used <= old_hiwater) break; } while (!atomic_long_try_cmpxchg(&mem->used_hiwater, &old_hiwater, new_used)); } static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) { atomic_long_sub(nslots, &mem->total_used); } #else /* !CONFIG_DEBUG_FS */ static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots) { } static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) { } #endif /* CONFIG_DEBUG_FS */ /** * swiotlb_area_find_slots() - search for slots in one IO TLB memory area * @dev: Device which maps the buffer. * @pool: Memory pool to be searched. * @area_index: Index of the IO TLB memory area to be searched. * @orig_addr: Original (non-bounced) IO buffer address. * @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. * * Find a suitable sequence of IO TLB entries for the request and allocate * a buffer from the given IO TLB memory area. * This function takes care of locking. * * Return: Index of the first allocated slot, or -1 on error. */ static int swiotlb_area_find_slots(struct device *dev, struct io_tlb_pool *pool, int area_index, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask) { struct io_tlb_area *area = pool->areas + area_index; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, pool->start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); unsigned int iotlb_align_mask = dma_get_min_align_mask(dev) | alloc_align_mask; unsigned int nslots = nr_slots(alloc_size), stride; unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int index, slots_checked, count = 0, i; unsigned long flags; unsigned int slot_base; unsigned int slot_index; BUG_ON(!nslots); BUG_ON(area_index >= pool->nareas); /* * For allocations of PAGE_SIZE or larger only look for page aligned * allocations. */ if (alloc_size >= PAGE_SIZE) iotlb_align_mask |= ~PAGE_MASK; iotlb_align_mask &= ~(IO_TLB_SIZE - 1); /* * For mappings with an alignment requirement don't bother looping to * unaligned slots once we found an aligned one. */ stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1; spin_lock_irqsave(&area->lock, flags); if (unlikely(nslots > pool->area_nslabs - area->used)) goto not_found; slot_base = area_index * pool->area_nslabs; index = area->index; for (slots_checked = 0; slots_checked < pool->area_nslabs; ) { slot_index = slot_base + index; if (orig_addr && (slot_addr(tbl_dma_addr, slot_index) & iotlb_align_mask) != (orig_addr & iotlb_align_mask)) { index = wrap_area_index(pool, index + 1); slots_checked++; continue; } if (!iommu_is_span_boundary(slot_index, nslots, nr_slots(tbl_dma_addr), max_slots)) { if (pool->slots[slot_index].list >= nslots) goto found; } index = wrap_area_index(pool, index + stride); slots_checked += stride; } not_found: spin_unlock_irqrestore(&area->lock, flags); return -1; found: /* * If we find a slot that indicates we have 'nslots' number of * contiguous buffers, we allocate the buffers from that slot onwards * and set the list of free entries to '0' indicating unavailable. */ for (i = slot_index; i < slot_index + nslots; i++) { pool->slots[i].list = 0; pool->slots[i].alloc_size = alloc_size - (offset + ((i - slot_index) << IO_TLB_SHIFT)); } for (i = slot_index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && pool->slots[i].list; i--) pool->slots[i].list = ++count; /* * Update the indices to avoid searching in the next round. */ area->index = wrap_area_index(pool, index + nslots); area->used += nslots; spin_unlock_irqrestore(&area->lock, flags); inc_used_and_hiwater(dev->dma_io_tlb_mem, nslots); return slot_index; } /** * swiotlb_pool_find_slots() - search for slots in one memory pool * @dev: Device which maps the buffer. * @pool: Memory pool to be searched. * @orig_addr: Original (non-bounced) IO buffer address. * @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. * * Search through one memory pool to find a sequence of slots that match the * allocation constraints. * * Return: Index of the first allocated slot, or -1 on error. */ static int swiotlb_pool_find_slots(struct device *dev, struct io_tlb_pool *pool, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask) { int start = raw_smp_processor_id() & (pool->nareas - 1); int i = start, index; do { index = swiotlb_area_find_slots(dev, pool, i, orig_addr, alloc_size, alloc_align_mask); if (index >= 0) return index; if (++i >= pool->nareas) i = 0; } while (i != start); return -1; } #ifdef CONFIG_SWIOTLB_DYNAMIC /** * swiotlb_find_slots() - search for slots in the whole swiotlb * @dev: Device which maps the buffer. * @orig_addr: Original (non-bounced) IO buffer address. * @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. * @retpool: Used memory pool, updated on return. * * Search through the whole software IO TLB to find a sequence of slots that * match the allocation constraints. * * Return: Index of the first allocated slot, or -1 on error. */ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; unsigned long nslabs; unsigned long flags; u64 phys_limit; int index; rcu_read_lock(); list_for_each_entry_rcu(pool, &mem->pools, node) { index = swiotlb_pool_find_slots(dev, pool, orig_addr, alloc_size, alloc_align_mask); if (index >= 0) { rcu_read_unlock(); goto found; } } rcu_read_unlock(); if (!mem->can_grow) return -1; schedule_work(&mem->dyn_alloc); nslabs = nr_slots(alloc_size); phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit); pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit, GFP_NOWAIT | __GFP_NOWARN); if (!pool) return -1; index = swiotlb_pool_find_slots(dev, pool, orig_addr, alloc_size, alloc_align_mask); if (index < 0) { swiotlb_dyn_free(&pool->rcu); return -1; } pool->transient = true; spin_lock_irqsave(&dev->dma_io_tlb_lock, flags); list_add_rcu(&pool->node, &dev->dma_io_tlb_pools); spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags); found: WRITE_ONCE(dev->dma_uses_io_tlb, true); /* * The general barrier orders reads and writes against a presumed store * of the SWIOTLB buffer address by a device driver (to a driver private * data structure). It serves two purposes. * * First, the store to dev->dma_uses_io_tlb must be ordered before the * presumed store. This guarantees that the returned buffer address * cannot be passed to another CPU before updating dev->dma_uses_io_tlb. * * Second, the load from mem->pools must be ordered before the same * presumed store. This guarantees that the returned buffer address * cannot be observed by another CPU before an update of the RCU list * that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy * atomicity). * * See also the comment in is_swiotlb_buffer(). */ smp_mb(); *retpool = pool; return index; } #else /* !CONFIG_SWIOTLB_DYNAMIC */ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { *retpool = &dev->dma_io_tlb_mem->defpool; return swiotlb_pool_find_slots(dev, *retpool, orig_addr, alloc_size, alloc_align_mask); } #endif /* CONFIG_SWIOTLB_DYNAMIC */ #ifdef CONFIG_DEBUG_FS /** * mem_used() - get number of used slots in an allocator * @mem: Software IO TLB allocator. * * The result is accurate in this version of the function, because an atomic * counter is available if CONFIG_DEBUG_FS is set. * * Return: Number of used slots. */ static unsigned long mem_used(struct io_tlb_mem *mem) { return atomic_long_read(&mem->total_used); } #else /* !CONFIG_DEBUG_FS */ /** * mem_pool_used() - get number of used slots in a memory pool * @pool: Software IO TLB memory pool. * * The result is not accurate, see mem_used(). * * Return: Approximate number of used slots. */ static unsigned long mem_pool_used(struct io_tlb_pool *pool) { int i; unsigned long used = 0; for (i = 0; i < pool->nareas; i++) used += pool->areas[i].used; return used; } /** * mem_used() - get number of used slots in an allocator * @mem: Software IO TLB allocator. * * The result is not accurate, because there is no locking of individual * areas. * * Return: Approximate number of used slots. */ static unsigned long mem_used(struct io_tlb_mem *mem) { #ifdef CONFIG_SWIOTLB_DYNAMIC struct io_tlb_pool *pool; unsigned long used = 0; rcu_read_lock(); list_for_each_entry_rcu(pool, &mem->pools, node) used += mem_pool_used(pool); rcu_read_unlock(); return used; #else return mem_pool_used(&mem->defpool); #endif } #endif /* CONFIG_DEBUG_FS */ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, unsigned int alloc_align_mask, enum dma_data_direction dir, unsigned long attrs) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned int offset = swiotlb_align_offset(dev, orig_addr); struct io_tlb_pool *pool; unsigned int i; int index; phys_addr_t tlb_addr; if (!mem || !mem->nslabs) { dev_warn_ratelimited(dev, "Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); return (phys_addr_t)DMA_MAPPING_ERROR; } if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); if (mapping_size > alloc_size) { dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", mapping_size, alloc_size); return (phys_addr_t)DMA_MAPPING_ERROR; } index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset, alloc_align_mask, &pool); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", alloc_size, mem->nslabs, mem_used(mem)); return (phys_addr_t)DMA_MAPPING_ERROR; } /* * Save away the mapping from the original address to the DMA address. * This is needed when we sync the memory. Then we sync the buffer if * needed. */ for (i = 0; i < nr_slots(alloc_size + offset); i++) pool->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(pool->start, index) + offset; /* * When dir == DMA_FROM_DEVICE we could omit the copy from the orig * to the tlb buffer, if we knew for sure the device will * overwrite the entire current content. But we don't. Thus * unconditional bounce may prevent leaking swiotlb content (i.e. * kernel memory) to user-space. */ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) { struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr); unsigned long flags; unsigned int offset = swiotlb_align_offset(dev, tlb_addr); int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; int nslots = nr_slots(mem->slots[index].alloc_size + offset); int aindex = index / mem->area_nslabs; struct io_tlb_area *area = &mem->areas[aindex]; int count, i; /* * Return the buffer to the free list by setting the corresponding * entries to indicate the number of contiguous entries available. * While returning the entries to the free list, we merge the entries * with slots below and above the pool being returned. */ BUG_ON(aindex >= mem->nareas); spin_lock_irqsave(&area->lock, flags); if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) count = mem->slots[index + nslots].list; else count = 0; /* * Step 1: return the slots to the free list, merging the slots with * superceeding slots */ for (i = index + nslots - 1; i >= index; i--) { mem->slots[i].list = ++count; mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; } /* * Step 2: merge the returned slots with the preceding slots, if * available (non zero) */ for (i = index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) mem->slots[i].list = ++count; area->used -= nslots; spin_unlock_irqrestore(&area->lock, flags); dec_used(dev->dma_io_tlb_mem, nslots); } #ifdef CONFIG_SWIOTLB_DYNAMIC /** * swiotlb_del_transient() - delete a transient memory pool * @dev: Device which mapped the buffer. * @tlb_addr: Physical address within a bounce buffer. * * Check whether the address belongs to a transient SWIOTLB memory pool. * If yes, then delete the pool. * * Return: %true if @tlb_addr belonged to a transient pool that was released. */ static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr) { struct io_tlb_pool *pool; pool = swiotlb_find_pool(dev, tlb_addr); if (!pool->transient) return false; dec_used(dev->dma_io_tlb_mem, pool->nslabs); swiotlb_del_pool(dev, pool); return true; } #else /* !CONFIG_SWIOTLB_DYNAMIC */ static inline bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr) { return false; } #endif /* CONFIG_SWIOTLB_DYNAMIC */ /* * tlb_addr is the physical address of the bounce buffer to unmap. */ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, size_t mapping_size, enum dma_data_direction dir, unsigned long attrs) { /* * First, sync the memory before unmapping the entry */ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE); if (swiotlb_del_transient(dev, tlb_addr)) return; swiotlb_release_slots(dev, tlb_addr); } void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); else BUG_ON(dir != DMA_FROM_DEVICE); } void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE); else BUG_ON(dir != DMA_TO_DEVICE); } /* * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing * to the device copy the data into it as well. */ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs) { phys_addr_t swiotlb_addr; dma_addr_t dma_addr; trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size); swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir, attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); dev_WARN_ONCE(dev, 1, "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); return DMA_MAPPING_ERROR; } if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) arch_sync_dma_for_device(swiotlb_addr, size, dir); return dma_addr; } size_t swiotlb_max_mapping_size(struct device *dev) { int min_align_mask = dma_get_min_align_mask(dev); int min_align = 0; /* * swiotlb_find_slots() skips slots according to * min align mask. This affects max mapping size. * Take it into acount here. */ if (min_align_mask) min_align = roundup(min_align_mask, IO_TLB_SIZE); return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align; } /** * is_swiotlb_allocated() - check if the default software IO TLB is initialized */ bool is_swiotlb_allocated(void) { return io_tlb_default_mem.nslabs; } bool is_swiotlb_active(struct device *dev) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; return mem && mem->nslabs; } /** * default_swiotlb_base() - get the base address of the default SWIOTLB * * Get the lowest physical address used by the default software IO TLB pool. */ phys_addr_t default_swiotlb_base(void) { #ifdef CONFIG_SWIOTLB_DYNAMIC io_tlb_default_mem.can_grow = false; #endif return io_tlb_default_mem.defpool.start; } /** * default_swiotlb_limit() - get the address limit of the default SWIOTLB * * Get the highest physical address used by the default software IO TLB pool. */ phys_addr_t default_swiotlb_limit(void) { #ifdef CONFIG_SWIOTLB_DYNAMIC return io_tlb_default_mem.phys_limit; #else return io_tlb_default_mem.defpool.end - 1; #endif } #ifdef CONFIG_DEBUG_FS static int io_tlb_used_get(void *data, u64 *val) { struct io_tlb_mem *mem = data; *val = mem_used(mem); return 0; } static int io_tlb_hiwater_get(void *data, u64 *val) { struct io_tlb_mem *mem = data; *val = atomic_long_read(&mem->used_hiwater); return 0; } static int io_tlb_hiwater_set(void *data, u64 val) { struct io_tlb_mem *mem = data; /* Only allow setting to zero */ if (val != 0) return -EINVAL; atomic_long_set(&mem->used_hiwater, val); return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get, io_tlb_hiwater_set, "%llu\n"); static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { atomic_long_set(&mem->total_used, 0); atomic_long_set(&mem->used_hiwater, 0); mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs); if (!mem->nslabs) return; debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); debugfs_create_file("io_tlb_used", 0400, mem->debugfs, mem, &fops_io_tlb_used); debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem, &fops_io_tlb_hiwater); } static int __init swiotlb_create_default_debugfs(void) { swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb"); return 0; } late_initcall(swiotlb_create_default_debugfs); #else /* !CONFIG_DEBUG_FS */ static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { } #endif /* CONFIG_DEBUG_FS */ #ifdef CONFIG_DMA_RESTRICTED_POOL struct page *swiotlb_alloc(struct device *dev, size_t size) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; phys_addr_t tlb_addr; int index; if (!mem) return NULL; index = swiotlb_find_slots(dev, 0, size, 0, &pool); if (index == -1) return NULL; tlb_addr = slot_addr(pool->start, index); return pfn_to_page(PFN_DOWN(tlb_addr)); } bool swiotlb_free(struct device *dev, struct page *page, size_t size) { phys_addr_t tlb_addr = page_to_phys(page); if (!is_swiotlb_buffer(dev, tlb_addr)) return false; swiotlb_release_slots(dev, tlb_addr); return true; } static int rmem_swiotlb_device_init(struct reserved_mem *rmem, struct device *dev) { struct io_tlb_mem *mem = rmem->priv; unsigned long nslabs = rmem->size >> IO_TLB_SHIFT; /* Set Per-device io tlb area to one */ unsigned int nareas = 1; if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) { dev_err(dev, "Restricted DMA pool must be accessible within the linear mapping."); return -EINVAL; } /* * Since multiple devices can share the same pool, the private data, * io_tlb_mem struct, will be initialized by the first device attached * to it. */ if (!mem) { struct io_tlb_pool *pool; mem = kzalloc(sizeof(*mem), GFP_KERNEL); if (!mem) return -ENOMEM; pool = &mem->defpool; pool->slots = kcalloc(nslabs, sizeof(*pool->slots), GFP_KERNEL); if (!pool->slots) { kfree(mem); return -ENOMEM; } pool->areas = kcalloc(nareas, sizeof(*pool->areas), GFP_KERNEL); if (!pool->areas) { kfree(pool->slots); kfree(mem); return -ENOMEM; } set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), rmem->size >> PAGE_SHIFT); swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs, false, nareas); mem->force_bounce = true; mem->for_alloc = true; #ifdef CONFIG_SWIOTLB_DYNAMIC spin_lock_init(&mem->lock); #endif add_mem_pool(mem, pool); rmem->priv = mem; swiotlb_create_debugfs_files(mem, rmem->name); } dev->dma_io_tlb_mem = mem; return 0; } static void rmem_swiotlb_device_release(struct reserved_mem *rmem, struct device *dev) { dev->dma_io_tlb_mem = &io_tlb_default_mem; } static const struct reserved_mem_ops rmem_swiotlb_ops = { .device_init = rmem_swiotlb_device_init, .device_release = rmem_swiotlb_device_release, }; static int __init rmem_swiotlb_setup(struct reserved_mem *rmem) { unsigned long node = rmem->fdt_node; if (of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "linux,cma-default", NULL) || of_get_flat_dt_prop(node, "linux,dma-default", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; rmem->ops = &rmem_swiotlb_ops; pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); return 0; } RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup); #endif /* CONFIG_DMA_RESTRICTED_POOL */ |
288 288 287 287 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 | // SPDX-License-Identifier: GPL-2.0-only /* * Common framework for low-level network console, dump, and debugger code * * Sep 8 2003 Matt Mackall <mpm@selenic.com> * * based on the netconsole code from: * * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com> * Copyright (C) 2002 Red Hat, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> #include <linux/netpoll.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/if_vlan.h> #include <net/tcp.h> #include <net/udp.h> #include <net/addrconf.h> #include <net/ndisc.h> #include <net/ip6_checksum.h> #include <asm/unaligned.h> #include <trace/events/napi.h> #include <linux/kconfig.h> /* * We maintain a small pool of fully-sized skbs, to make sure the * message gets out even in extreme OOM situations. */ #define MAX_UDP_CHUNK 1460 #define MAX_SKBS 32 static struct sk_buff_head skb_pool; DEFINE_STATIC_SRCU(netpoll_srcu); #define USEC_PER_POLL 50 #define MAX_SKB_SIZE \ (sizeof(struct ethhdr) + \ sizeof(struct iphdr) + \ sizeof(struct udphdr) + \ MAX_UDP_CHUNK) static void zap_completion_queue(void); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); #define np_info(np, fmt, ...) \ pr_info("%s: " fmt, np->name, ##__VA_ARGS__) #define np_err(np, fmt, ...) \ pr_err("%s: " fmt, np->name, ##__VA_ARGS__) #define np_notice(np, fmt, ...) \ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { netdev_tx_t status = NETDEV_TX_OK; netdev_features_t features; features = netif_skb_features(skb); if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { skb = __vlan_hwaccel_push_inside(skb); if (unlikely(!skb)) { /* This is actually a packet drop, but we * don't want the code that calls this * function to try and operate on a NULL skb. */ goto out; } } status = netdev_start_xmit(skb, dev, txq, false); out: return status; } static void queue_process(struct work_struct *work) { struct netpoll_info *npinfo = container_of(work, struct netpoll_info, tx_work.work); struct sk_buff *skb; unsigned long flags; while ((skb = skb_dequeue(&npinfo->txq))) { struct net_device *dev = skb->dev; struct netdev_queue *txq; unsigned int q_index; if (!netif_device_present(dev) || !netif_running(dev)) { kfree_skb(skb); continue; } local_irq_save(flags); /* check if skb->queue_mapping is still valid */ q_index = skb_get_queue_mapping(skb); if (unlikely(q_index >= dev->real_num_tx_queues)) { q_index = q_index % dev->real_num_tx_queues; skb_set_queue_mapping(skb, q_index); } txq = netdev_get_tx_queue(dev, q_index); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (netif_xmit_frozen_or_stopped(txq) || !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) { skb_queue_head(&npinfo->txq, skb); HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); schedule_delayed_work(&npinfo->tx_work, HZ/10); return; } HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); } } static int netif_local_xmit_active(struct net_device *dev) { int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id()) return 1; } return 0; } static void poll_one_napi(struct napi_struct *napi) { int work; /* If we set this bit but see that it has already been set, * that indicates that napi has been disabled and we need * to abort this operation */ if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) return; /* We explicilty pass the polling call a budget of 0 to * indicate that we are clearing the Tx path only. */ work = napi->poll(napi, 0); WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll); trace_napi_poll(napi, work, 0); clear_bit(NAPI_STATE_NPSVC, &napi->state); } static void poll_napi(struct net_device *dev) { struct napi_struct *napi; int cpu = smp_processor_id(); list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) { poll_one_napi(napi); smp_store_release(&napi->poll_owner, -1); } } } void netpoll_poll_dev(struct net_device *dev) { struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); const struct net_device_ops *ops; /* Don't do any rx activity if the dev_lock mutex is held * the dev_open/close paths use this to block netpoll activity * while changing device state */ if (!ni || down_trylock(&ni->dev_lock)) return; /* Some drivers will take the same locks in poll and xmit, * we can't poll if local CPU is already in xmit. */ if (!netif_running(dev) || netif_local_xmit_active(dev)) { up(&ni->dev_lock); return; } ops = dev->netdev_ops; if (ops->ndo_poll_controller) ops->ndo_poll_controller(dev); poll_napi(dev); up(&ni->dev_lock); zap_completion_queue(); } EXPORT_SYMBOL(netpoll_poll_dev); void netpoll_poll_disable(struct net_device *dev) { struct netpoll_info *ni; int idx; might_sleep(); idx = srcu_read_lock(&netpoll_srcu); ni = srcu_dereference(dev->npinfo, &netpoll_srcu); if (ni) down(&ni->dev_lock); srcu_read_unlock(&netpoll_srcu, idx); } EXPORT_SYMBOL(netpoll_poll_disable); void netpoll_poll_enable(struct net_device *dev) { struct netpoll_info *ni; rcu_read_lock(); ni = rcu_dereference(dev->npinfo); if (ni) up(&ni->dev_lock); rcu_read_unlock(); } EXPORT_SYMBOL(netpoll_poll_enable); static void refill_skbs(void) { struct sk_buff *skb; unsigned long flags; spin_lock_irqsave(&skb_pool.lock, flags); while (skb_pool.qlen < MAX_SKBS) { skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); if (!skb) break; __skb_queue_tail(&skb_pool, skb); } spin_unlock_irqrestore(&skb_pool.lock, flags); } static void zap_completion_queue(void) { unsigned long flags; struct softnet_data *sd = &get_cpu_var(softnet_data); if (sd->completion_queue) { struct sk_buff *clist; local_irq_save(flags); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_restore(flags); while (clist != NULL) { struct sk_buff *skb = clist; clist = clist->next; if (!skb_irq_freeable(skb)) { refcount_set(&skb->users, 1); dev_kfree_skb_any(skb); /* put this one back */ } else { __kfree_skb(skb); } } } put_cpu_var(softnet_data); } static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) { int count = 0; struct sk_buff *skb; zap_completion_queue(); refill_skbs(); repeat: skb = alloc_skb(len, GFP_ATOMIC); if (!skb) skb = skb_dequeue(&skb_pool); if (!skb) { if (++count < 10) { netpoll_poll_dev(np->dev); goto repeat; } return NULL; } refcount_set(&skb->users, 1); skb_reserve(skb, reserve); return skb; } static int netpoll_owner_active(struct net_device *dev) { struct napi_struct *napi; list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { if (napi->poll_owner == smp_processor_id()) return 1; } return 0; } /* call with IRQ disabled */ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; lockdep_assert_irqs_disabled(); dev = np->dev; npinfo = rcu_dereference_bh(dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); return NET_XMIT_DROP; } /* don't get messages out of order, and no recursion */ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; txq = netdev_core_pick_tx(dev, skb, NULL); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { if (HARD_TX_TRYLOCK(dev, txq)) { if (!netif_xmit_stopped(txq)) status = netpoll_start_xmit(skb, dev, txq); HARD_TX_UNLOCK(dev, txq); if (dev_xmit_complete(status)) break; } /* tickle device maybe there is some cleanup */ netpoll_poll_dev(np->dev); udelay(USEC_PER_POLL); } WARN_ONCE(!irqs_disabled(), "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n", dev->name, dev->netdev_ops->ndo_start_xmit); } if (!dev_xmit_complete(status)) { skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } return NETDEV_TX_OK; } netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; netdev_tx_t ret; if (unlikely(!np)) { dev_kfree_skb_irq(skb); ret = NET_XMIT_DROP; } else { local_irq_save(flags); ret = __netpoll_send_skb(np, skb); local_irq_restore(flags); } return ret; } EXPORT_SYMBOL(netpoll_send_skb); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { int total_len, ip_len, udp_len; struct sk_buff *skb; struct udphdr *udph; struct iphdr *iph; struct ethhdr *eth; static atomic_t ip_ident; struct ipv6hdr *ip6h; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!irqs_disabled()); udp_len = len + sizeof(*udph); if (np->ipv6) ip_len = udp_len + sizeof(*ip6h); else ip_len = udp_len + sizeof(*iph); total_len = ip_len + LL_RESERVED_SPACE(np->dev); skb = find_skb(np, total_len + np->dev->needed_tailroom, total_len - len); if (!skb) return; skb_copy_to_linear_data(skb, msg, len); skb_put(skb, len); skb_push(skb, sizeof(*udph)); skb_reset_transport_header(skb); udph = udp_hdr(skb); udph->source = htons(np->local_port); udph->dest = htons(np->remote_port); udph->len = htons(udp_len); if (np->ipv6) { udph->check = 0; udph->check = csum_ipv6_magic(&np->local_ip.in6, &np->remote_ip.in6, udp_len, IPPROTO_UDP, csum_partial(udph, udp_len, 0)); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); /* ip6h->version = 6; ip6h->priority = 0; */ *(unsigned char *)ip6h = 0x60; ip6h->flow_lbl[0] = 0; ip6h->flow_lbl[1] = 0; ip6h->flow_lbl[2] = 0; ip6h->payload_len = htons(sizeof(struct udphdr) + len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = 32; ip6h->saddr = np->local_ip.in6; ip6h->daddr = np->remote_ip.in6; eth = skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IPV6); } else { udph->check = 0; udph->check = csum_tcpudp_magic(np->local_ip.ip, np->remote_ip.ip, udp_len, IPPROTO_UDP, csum_partial(udph, udp_len, 0)); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); iph = ip_hdr(skb); /* iph->version = 4; iph->ihl = 5; */ *(unsigned char *)iph = 0x45; iph->tos = 0; put_unaligned(htons(ip_len), &(iph->tot_len)); iph->id = htons(atomic_inc_return(&ip_ident)); iph->frag_off = 0; iph->ttl = 64; iph->protocol = IPPROTO_UDP; iph->check = 0; put_unaligned(np->local_ip.ip, &(iph->saddr)); put_unaligned(np->remote_ip.ip, &(iph->daddr)); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); eth = skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IP); } ether_addr_copy(eth->h_source, np->dev->dev_addr); ether_addr_copy(eth->h_dest, np->remote_mac); skb->dev = np->dev; netpoll_send_skb(np, skb); } EXPORT_SYMBOL(netpoll_send_udp); void netpoll_print_options(struct netpoll *np) { np_info(np, "local port %d\n", np->local_port); if (np->ipv6) np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); else np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); np_info(np, "interface '%s'\n", np->dev_name); np_info(np, "remote port %d\n", np->remote_port); if (np->ipv6) np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); else np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip); np_info(np, "remote ethernet address %pM\n", np->remote_mac); } EXPORT_SYMBOL(netpoll_print_options); static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) { const char *end; if (!strchr(str, ':') && in4_pton(str, -1, (void *)addr, -1, &end) > 0) { if (!*end) return 0; } if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) { #if IS_ENABLED(CONFIG_IPV6) if (!*end) return 1; #else return -1; #endif } return -1; } int netpoll_parse_options(struct netpoll *np, char *opt) { char *cur=opt, *delim; int ipv6; bool ipversion_set = false; if (*cur != '@') { if ((delim = strchr(cur, '@')) == NULL) goto parse_failed; *delim = 0; if (kstrtou16(cur, 10, &np->local_port)) goto parse_failed; cur = delim; } cur++; if (*cur != '/') { ipversion_set = true; if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); if (ipv6 < 0) goto parse_failed; else np->ipv6 = (bool)ipv6; cur = delim; } cur++; if (*cur != ',') { /* parse out dev name */ if ((delim = strchr(cur, ',')) == NULL) goto parse_failed; *delim = 0; strscpy(np->dev_name, cur, sizeof(np->dev_name)); cur = delim; } cur++; if (*cur != '@') { /* dst port */ if ((delim = strchr(cur, '@')) == NULL) goto parse_failed; *delim = 0; if (*cur == ' ' || *cur == '\t') np_info(np, "warning: whitespace is not allowed\n"); if (kstrtou16(cur, 10, &np->remote_port)) goto parse_failed; cur = delim; } cur++; /* dst ip */ if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); if (ipv6 < 0) goto parse_failed; else if (ipversion_set && np->ipv6 != (bool)ipv6) goto parse_failed; else np->ipv6 = (bool)ipv6; cur = delim + 1; if (*cur != 0) { /* MAC address */ if (!mac_pton(cur, np->remote_mac)) goto parse_failed; } netpoll_print_options(np); return 0; parse_failed: np_info(np, "couldn't parse config at '%s'!\n", cur); return -1; } EXPORT_SYMBOL(netpoll_parse_options); int __netpoll_setup(struct netpoll *np, struct net_device *ndev) { struct netpoll_info *npinfo; const struct net_device_ops *ops; int err; np->dev = ndev; strscpy(np->dev_name, ndev->name, IFNAMSIZ); if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", np->dev_name); err = -ENOTSUPP; goto out; } if (!ndev->npinfo) { npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; goto out; } sema_init(&npinfo->dev_lock, 1); skb_queue_head_init(&npinfo->txq); INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); refcount_set(&npinfo->refcnt, 1); ops = np->dev->netdev_ops; if (ops->ndo_netpoll_setup) { err = ops->ndo_netpoll_setup(ndev, npinfo); if (err) goto free_npinfo; } } else { npinfo = rtnl_dereference(ndev->npinfo); refcount_inc(&npinfo->refcnt); } npinfo->netpoll = np; /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); return 0; free_npinfo: kfree(npinfo); out: return err; } EXPORT_SYMBOL_GPL(__netpoll_setup); int netpoll_setup(struct netpoll *np) { struct net_device *ndev = NULL; struct in_device *in_dev; int err; rtnl_lock(); if (np->dev_name[0]) { struct net *net = current->nsproxy->net_ns; ndev = __dev_get_by_name(net, np->dev_name); } if (!ndev) { np_err(np, "%s doesn't exist, aborting\n", np->dev_name); err = -ENODEV; goto unlock; } netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL); if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", np->dev_name); err = -EBUSY; goto put; } if (!netif_running(ndev)) { unsigned long atmost; np_info(np, "device %s not up yet, forcing it\n", np->dev_name); err = dev_open(ndev, NULL); if (err) { np_err(np, "failed to open %s\n", ndev->name); goto put; } rtnl_unlock(); atmost = jiffies + carrier_timeout * HZ; while (!netif_carrier_ok(ndev)) { if (time_after(jiffies, atmost)) { np_notice(np, "timeout waiting for carrier\n"); break; } msleep(1); } rtnl_lock(); } if (!np->local_ip.ip) { if (!np->ipv6) { const struct in_ifaddr *ifa; in_dev = __in_dev_get_rtnl(ndev); if (!in_dev) goto put_noaddr; ifa = rtnl_dereference(in_dev->ifa_list); if (!ifa) { put_noaddr: np_err(np, "no IP address for %s, aborting\n", np->dev_name); err = -EDESTADDRREQ; goto put; } np->local_ip.ip = ifa->ifa_local; np_info(np, "local IP %pI4\n", &np->local_ip.ip); } else { #if IS_ENABLED(CONFIG_IPV6) struct inet6_dev *idev; err = -EDESTADDRREQ; idev = __in6_dev_get(ndev); if (idev) { struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) continue; np->local_ip.in6 = ifp->addr; err = 0; break; } read_unlock_bh(&idev->lock); } if (err) { np_err(np, "no IPv6 address for %s, aborting\n", np->dev_name); goto put; } else np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); #else np_err(np, "IPv6 is not supported %s, aborting\n", np->dev_name); err = -EINVAL; goto put; #endif } } /* fill up the skb queue */ refill_skbs(); err = __netpoll_setup(np, ndev); if (err) goto put; rtnl_unlock(); return 0; put: netdev_put(ndev, &np->dev_tracker); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(netpoll_setup); static int __init netpoll_init(void) { skb_queue_head_init(&skb_pool); return 0; } core_initcall(netpoll_init); static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) { struct netpoll_info *npinfo = container_of(rcu_head, struct netpoll_info, rcu); skb_queue_purge(&npinfo->txq); /* we can't call cancel_delayed_work_sync here, as we are in softirq */ cancel_delayed_work(&npinfo->tx_work); /* clean after last, unfinished work */ __skb_queue_purge(&npinfo->txq); /* now cancel it again */ cancel_delayed_work(&npinfo->tx_work); kfree(npinfo); } void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; npinfo = rtnl_dereference(np->dev->npinfo); if (!npinfo) return; synchronize_srcu(&netpoll_srcu); if (refcount_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; ops = np->dev->netdev_ops; if (ops->ndo_netpoll_cleanup) ops->ndo_netpoll_cleanup(np->dev); RCU_INIT_POINTER(np->dev->npinfo, NULL); call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); } else RCU_INIT_POINTER(np->dev->npinfo, NULL); } EXPORT_SYMBOL_GPL(__netpoll_cleanup); void __netpoll_free(struct netpoll *np) { ASSERT_RTNL(); /* Wait for transmitting packets to finish before freeing. */ synchronize_rcu(); __netpoll_cleanup(np); kfree(np); } EXPORT_SYMBOL_GPL(__netpoll_free); void netpoll_cleanup(struct netpoll *np) { rtnl_lock(); if (!np->dev) goto out; __netpoll_cleanup(np); netdev_put(np->dev, &np->dev_tracker); np->dev = NULL; out: rtnl_unlock(); } EXPORT_SYMBOL(netpoll_cleanup); |
583 583 583 583 583 576 583 583 223 327 374 374 352 219 234 335 333 333 3 218 219 585 585 183 585 325 325 325 325 325 325 130 325 130 325 130 210 210 210 210 208 2 2 2 2 1 325 210 325 325 130 45 158 152 152 158 128 128 260 259 6 6 6 6 6 6 6 614 601 590 11 5 5 4 227 2 2 2 2 219 225 211 120 120 120 120 1 119 240 102 139 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel implementation * * Initialization/cleanup for SCTP protocol support. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/seq_file.h> #include <linux/memblock.h> #include <linux/highmem.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/route.h> #include <net/sctp/sctp.h> #include <net/addrconf.h> #include <net/inet_common.h> #include <net/inet_ecn.h> #include <net/udp_tunnel.h> #define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024) /* Global data structures. */ struct sctp_globals sctp_globals __read_mostly; struct idr sctp_assocs_id; DEFINE_SPINLOCK(sctp_assocs_id_lock); static struct sctp_pf *sctp_pf_inet6_specific; static struct sctp_pf *sctp_pf_inet_specific; static struct sctp_af *sctp_af_v4_specific; static struct sctp_af *sctp_af_v6_specific; struct kmem_cache *sctp_chunk_cachep __read_mostly; struct kmem_cache *sctp_bucket_cachep __read_mostly; long sysctl_sctp_mem[3]; int sysctl_sctp_rmem[3]; int sysctl_sctp_wmem[3]; /* Private helper to extract ipv4 address and stash them in * the protocol structure. */ static void sctp_v4_copy_addrlist(struct list_head *addrlist, struct net_device *dev) { struct in_device *in_dev; struct in_ifaddr *ifa; struct sctp_sockaddr_entry *addr; rcu_read_lock(); if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { /* Add the address to the local list. */ addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v4.sin_family = AF_INET; addr->a.v4.sin_addr.s_addr = ifa->ifa_local; addr->valid = 1; INIT_LIST_HEAD(&addr->list); list_add_tail(&addr->list, addrlist); } } rcu_read_unlock(); } /* Extract our IP addresses from the system and stash them in the * protocol structure. */ static void sctp_get_local_addr_list(struct net *net) { struct net_device *dev; struct list_head *pos; struct sctp_af *af; rcu_read_lock(); for_each_netdev_rcu(net, dev) { list_for_each(pos, &sctp_address_families) { af = list_entry(pos, struct sctp_af, list); af->copy_addrlist(&net->sctp.local_addr_list, dev); } } rcu_read_unlock(); } /* Free the existing local addresses. */ static void sctp_free_local_addr_list(struct net *net) { struct sctp_sockaddr_entry *addr; struct list_head *pos, *temp; list_for_each_safe(pos, temp, &net->sctp.local_addr_list) { addr = list_entry(pos, struct sctp_sockaddr_entry, list); list_del(pos); kfree(addr); } } /* Copy the local addresses which are valid for 'scope' into 'bp'. */ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp, enum sctp_scope scope, gfp_t gfp, int copy_flags) { struct sctp_sockaddr_entry *addr; union sctp_addr laddr; int error = 0; rcu_read_lock(); list_for_each_entry_rcu(addr, &net->sctp.local_addr_list, list) { if (!addr->valid) continue; if (!sctp_in_scope(net, &addr->a, scope)) continue; /* Now that the address is in scope, check to see if * the address type is really supported by the local * sock as well as the remote peer. */ if (addr->a.sa.sa_family == AF_INET && (!(copy_flags & SCTP_ADDR4_ALLOWED) || !(copy_flags & SCTP_ADDR4_PEERSUPP))) continue; if (addr->a.sa.sa_family == AF_INET6 && (!(copy_flags & SCTP_ADDR6_ALLOWED) || !(copy_flags & SCTP_ADDR6_PEERSUPP))) continue; laddr = addr->a; /* also works for setting ipv6 address port */ laddr.v4.sin_port = htons(bp->port); if (sctp_bind_addr_state(bp, &laddr) != -1) continue; error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a), SCTP_ADDR_SRC, GFP_ATOMIC); if (error) break; } rcu_read_unlock(); return error; } /* Copy over any ip options */ static void sctp_v4_copy_ip_options(struct sock *sk, struct sock *newsk) { struct inet_sock *newinet, *inet = inet_sk(sk); struct ip_options_rcu *inet_opt, *newopt = NULL; newinet = inet_sk(newsk); rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); if (newopt) memcpy(newopt, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); else pr_err("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); rcu_read_unlock(); } /* Account for the IP options */ static int sctp_v4_ip_options_len(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct ip_options_rcu *inet_opt; int len = 0; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) len = inet_opt->opt.optlen; rcu_read_unlock(); return len; } /* Initialize a sctp_addr from in incoming skb. */ static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, int is_saddr) { /* Always called on head skb, so this is safe */ struct sctphdr *sh = sctp_hdr(skb); struct sockaddr_in *sa = &addr->v4; addr->v4.sin_family = AF_INET; if (is_saddr) { sa->sin_port = sh->source; sa->sin_addr.s_addr = ip_hdr(skb)->saddr; } else { sa->sin_port = sh->dest; sa->sin_addr.s_addr = ip_hdr(skb)->daddr; } memset(sa->sin_zero, 0, sizeof(sa->sin_zero)); } /* Initialize an sctp_addr from a socket. */ static void sctp_v4_from_sk(union sctp_addr *addr, struct sock *sk) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = 0; addr->v4.sin_addr.s_addr = inet_sk(sk)->inet_rcv_saddr; memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); } /* Initialize sk->sk_rcv_saddr from sctp_addr. */ static void sctp_v4_to_sk_saddr(union sctp_addr *addr, struct sock *sk) { inet_sk(sk)->inet_rcv_saddr = addr->v4.sin_addr.s_addr; } /* Initialize sk->sk_daddr from sctp_addr. */ static void sctp_v4_to_sk_daddr(union sctp_addr *addr, struct sock *sk) { inet_sk(sk)->inet_daddr = addr->v4.sin_addr.s_addr; } /* Initialize a sctp_addr from an address parameter. */ static bool sctp_v4_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { if (ntohs(param->v4.param_hdr.length) < sizeof(struct sctp_ipv4addr_param)) return false; addr->v4.sin_family = AF_INET; addr->v4.sin_port = port; addr->v4.sin_addr.s_addr = param->v4.addr.s_addr; memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); return true; } /* Initialize an address parameter from a sctp_addr and return the length * of the address parameter. */ static int sctp_v4_to_addr_param(const union sctp_addr *addr, union sctp_addr_param *param) { int length = sizeof(struct sctp_ipv4addr_param); param->v4.param_hdr.type = SCTP_PARAM_IPV4_ADDRESS; param->v4.param_hdr.length = htons(length); param->v4.addr.s_addr = addr->v4.sin_addr.s_addr; return length; } /* Initialize a sctp_addr from a dst_entry. */ static void sctp_v4_dst_saddr(union sctp_addr *saddr, struct flowi4 *fl4, __be16 port) { saddr->v4.sin_family = AF_INET; saddr->v4.sin_port = port; saddr->v4.sin_addr.s_addr = fl4->saddr; memset(saddr->v4.sin_zero, 0, sizeof(saddr->v4.sin_zero)); } /* Compare two addresses exactly. */ static int sctp_v4_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2) { if (addr1->sa.sa_family != addr2->sa.sa_family) return 0; if (addr1->v4.sin_port != addr2->v4.sin_port) return 0; if (addr1->v4.sin_addr.s_addr != addr2->v4.sin_addr.s_addr) return 0; return 1; } /* Initialize addr struct to INADDR_ANY. */ static void sctp_v4_inaddr_any(union sctp_addr *addr, __be16 port) { addr->v4.sin_family = AF_INET; addr->v4.sin_addr.s_addr = htonl(INADDR_ANY); addr->v4.sin_port = port; memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); } /* Is this a wildcard address? */ static int sctp_v4_is_any(const union sctp_addr *addr) { return htonl(INADDR_ANY) == addr->v4.sin_addr.s_addr; } /* This function checks if the address is a valid address to be used for * SCTP binding. * * Output: * Return 0 - If the address is a non-unicast or an illegal address. * Return 1 - If the address is a unicast. */ static int sctp_v4_addr_valid(union sctp_addr *addr, struct sctp_sock *sp, const struct sk_buff *skb) { /* IPv4 addresses not allowed */ if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; /* Is this a non-unicast address or a unusable SCTP address? */ if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr)) return 0; /* Is this a broadcast address? */ if (skb && skb_rtable(skb)->rt_flags & RTCF_BROADCAST) return 0; return 1; } /* Should this be available for binding? */ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) { struct sock *sk = &sp->inet.sk; struct net *net = sock_net(sk); int tb_id = RT_TABLE_LOCAL; int ret; tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ?: tb_id; ret = inet_addr_type_table(net, addr->v4.sin_addr.s_addr, tb_id); if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && ret != RTN_LOCAL && !inet_test_bit(FREEBIND, sk) && !READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind)) return 0; if (ipv6_only_sock(sctp_opt2sk(sp))) return 0; return 1; } /* Checking the loopback, private and other address scopes as defined in * RFC 1918. The IPv4 scoping is based on the draft for SCTP IPv4 * scoping <draft-stewart-tsvwg-sctp-ipv4-00.txt>. * * Level 0 - unusable SCTP addresses * Level 1 - loopback address * Level 2 - link-local addresses * Level 3 - private addresses. * Level 4 - global addresses * For INIT and INIT-ACK address list, let L be the level of * requested destination address, sender and receiver * SHOULD include all of its addresses with level greater * than or equal to L. * * IPv4 scoping can be controlled through sysctl option * net.sctp.addr_scope_policy */ static enum sctp_scope sctp_v4_scope(union sctp_addr *addr) { enum sctp_scope retval; /* Check for unusable SCTP addresses. */ if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_UNUSABLE; } else if (ipv4_is_loopback(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_LOOPBACK; } else if (ipv4_is_linklocal_169(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_LINK; } else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) || ipv4_is_private_172(addr->v4.sin_addr.s_addr) || ipv4_is_private_192(addr->v4.sin_addr.s_addr) || ipv4_is_test_198(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_PRIVATE; } else { retval = SCTP_SCOPE_GLOBAL; } return retval; } /* Returns a valid dst cache entry for the given source and destination ip * addresses. If an association is passed, trys to get a dst entry with a * source address that matches an address in the bind address list. */ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct flowi *fl, struct sock *sk) { struct sctp_association *asoc = t->asoc; struct rtable *rt; struct flowi _fl; struct flowi4 *fl4 = &_fl.u.ip4; struct sctp_bind_addr *bp; struct sctp_sockaddr_entry *laddr; struct dst_entry *dst = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; __u8 tos = inet_sk(sk)->tos; if (t->dscp & SCTP_DSCP_SET_MASK) tos = t->dscp & SCTP_DSCP_VAL_MASK; memset(&_fl, 0x0, sizeof(_fl)); fl4->daddr = daddr->v4.sin_addr.s_addr; fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { fl4->flowi4_tos = RT_TOS(tos); fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); } if (saddr) { fl4->saddr = saddr->v4.sin_addr.s_addr; if (!fl4->fl4_sport) fl4->fl4_sport = saddr->v4.sin_port; } pr_debug("%s: dst:%pI4, src:%pI4 - ", __func__, &fl4->daddr, &fl4->saddr); rt = ip_route_output_key(sock_net(sk), fl4); if (!IS_ERR(rt)) { dst = &rt->dst; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); } /* If there is no association or if a source address is passed, no * more validation is required. */ if (!asoc || saddr) goto out; bp = &asoc->base.bind_addr; if (dst) { /* Walk through the bind address list and look for a bind * address that matches the source address of the returned dst. */ sctp_v4_dst_saddr(&dst_saddr, fl4, htons(bp->port)); rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid || (laddr->state == SCTP_ADDR_DEL) || (laddr->state != SCTP_ADDR_SRC && !asoc->src_out_of_asoc_ok)) continue; if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a)) goto out_unlock; } rcu_read_unlock(); /* None of the bound addresses match the source address of the * dst. So release it. */ dst_release(dst); dst = NULL; } /* Walk through the bind address list and try to get a dst that * matches a bind address as the source address. */ rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { struct net_device *odev; if (!laddr->valid) continue; if (laddr->state != SCTP_ADDR_SRC || AF_INET != laddr->a.sa.sa_family) continue; fl4->fl4_sport = laddr->a.v4.sin_port; flowi4_update_output(fl4, asoc->base.sk->sk_bound_dev_if, daddr->v4.sin_addr.s_addr, laddr->a.v4.sin_addr.s_addr); rt = ip_route_output_key(sock_net(sk), fl4); if (IS_ERR(rt)) continue; /* Ensure the src address belongs to the output * interface. */ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, false); if (!odev || odev->ifindex != fl4->flowi4_oif) { if (!dst) { dst = &rt->dst; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); } else { dst_release(&rt->dst); } continue; } dst_release(dst); dst = &rt->dst; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); break; } out_unlock: rcu_read_unlock(); out: if (dst) { pr_debug("rt_dst:%pI4, rt_src:%pI4\n", &fl->u.ip4.daddr, &fl->u.ip4.saddr); } else { t->dst = NULL; pr_debug("no route\n"); } } /* For v4, the source address is cached in the route entry(dst). So no need * to cache it separately and hence this is an empty routine. */ static void sctp_v4_get_saddr(struct sctp_sock *sk, struct sctp_transport *t, struct flowi *fl) { union sctp_addr *saddr = &t->saddr; struct rtable *rt = (struct rtable *)t->dst; if (rt) { saddr->v4.sin_family = AF_INET; saddr->v4.sin_addr.s_addr = fl->u.ip4.saddr; } } /* What interface did this skb arrive on? */ static int sctp_v4_skb_iif(const struct sk_buff *skb) { return inet_iif(skb); } static int sctp_v4_skb_sdif(const struct sk_buff *skb) { return inet_sdif(skb); } /* Was this packet marked by Explicit Congestion Notification? */ static int sctp_v4_is_ce(const struct sk_buff *skb) { return INET_ECN_is_ce(ip_hdr(skb)->tos); } /* Create and initialize a new sk for the socket returned by accept(). */ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, struct sctp_association *asoc, bool kern) { struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, sk->sk_prot, kern); struct inet_sock *newinet; if (!newsk) goto out; sock_init_data(NULL, newsk); sctp_copy_sock(newsk, sk, asoc); sock_reset_flag(newsk, SOCK_ZAPPED); sctp_v4_copy_ip_options(sk, newsk); newinet = inet_sk(newsk); newinet->inet_daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr; if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); newsk = NULL; } out: return newsk; } static int sctp_v4_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) { /* No address mapping for V4 sockets */ memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); return sizeof(struct sockaddr_in); } /* Dump the v4 addr to the seq file. */ static void sctp_v4_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) { seq_printf(seq, "%pI4 ", &addr->v4.sin_addr); } static void sctp_v4_ecn_capable(struct sock *sk) { INET_ECN_xmit(sk); } static void sctp_addr_wq_timeout_handler(struct timer_list *t) { struct net *net = from_timer(net, t, sctp.addr_wq_timer); struct sctp_sockaddr_entry *addrw, *temp; struct sctp_sock *sp; spin_lock_bh(&net->sctp.addr_wq_lock); list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) { pr_debug("%s: the first ent in wq:%p is addr:%pISc for cmd:%d at " "entry:%p\n", __func__, &net->sctp.addr_waitq, &addrw->a.sa, addrw->state, addrw); #if IS_ENABLED(CONFIG_IPV6) /* Now we send an ASCONF for each association */ /* Note. we currently don't handle link local IPv6 addressees */ if (addrw->a.sa.sa_family == AF_INET6) { struct in6_addr *in6; if (ipv6_addr_type(&addrw->a.v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) goto free_next; in6 = (struct in6_addr *)&addrw->a.v6.sin6_addr; if (ipv6_chk_addr(net, in6, NULL, 0) == 0 && addrw->state == SCTP_ADDR_NEW) { unsigned long timeo_val; pr_debug("%s: this is on DAD, trying %d sec " "later\n", __func__, SCTP_ADDRESS_TICK_DELAY); timeo_val = jiffies; timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY); mod_timer(&net->sctp.addr_wq_timer, timeo_val); break; } } #endif list_for_each_entry(sp, &net->sctp.auto_asconf_splist, auto_asconf_list) { struct sock *sk; sk = sctp_opt2sk(sp); /* ignore bound-specific endpoints */ if (!sctp_is_ep_boundall(sk)) continue; bh_lock_sock(sk); if (sctp_asconf_mgmt(sp, addrw) < 0) pr_debug("%s: sctp_asconf_mgmt failed\n", __func__); bh_unlock_sock(sk); } #if IS_ENABLED(CONFIG_IPV6) free_next: #endif list_del(&addrw->list); kfree(addrw); } spin_unlock_bh(&net->sctp.addr_wq_lock); } static void sctp_free_addr_wq(struct net *net) { struct sctp_sockaddr_entry *addrw; struct sctp_sockaddr_entry *temp; spin_lock_bh(&net->sctp.addr_wq_lock); del_timer(&net->sctp.addr_wq_timer); list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) { list_del(&addrw->list); kfree(addrw); } spin_unlock_bh(&net->sctp.addr_wq_lock); } /* lookup the entry for the same address in the addr_waitq * sctp_addr_wq MUST be locked */ static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct net *net, struct sctp_sockaddr_entry *addr) { struct sctp_sockaddr_entry *addrw; list_for_each_entry(addrw, &net->sctp.addr_waitq, list) { if (addrw->a.sa.sa_family != addr->a.sa.sa_family) continue; if (addrw->a.sa.sa_family == AF_INET) { if (addrw->a.v4.sin_addr.s_addr == addr->a.v4.sin_addr.s_addr) return addrw; } else if (addrw->a.sa.sa_family == AF_INET6) { if (ipv6_addr_equal(&addrw->a.v6.sin6_addr, &addr->a.v6.sin6_addr)) return addrw; } } return NULL; } void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cmd) { struct sctp_sockaddr_entry *addrw; unsigned long timeo_val; /* first, we check if an opposite message already exist in the queue. * If we found such message, it is removed. * This operation is a bit stupid, but the DHCP client attaches the * new address after a couple of addition and deletion of that address */ spin_lock_bh(&net->sctp.addr_wq_lock); /* Offsets existing events in addr_wq */ addrw = sctp_addr_wq_lookup(net, addr); if (addrw) { if (addrw->state != cmd) { pr_debug("%s: offsets existing entry for %d, addr:%pISc " "in wq:%p\n", __func__, addrw->state, &addrw->a.sa, &net->sctp.addr_waitq); list_del(&addrw->list); kfree(addrw); } spin_unlock_bh(&net->sctp.addr_wq_lock); return; } /* OK, we have to add the new address to the wait queue */ addrw = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC); if (addrw == NULL) { spin_unlock_bh(&net->sctp.addr_wq_lock); return; } addrw->state = cmd; list_add_tail(&addrw->list, &net->sctp.addr_waitq); pr_debug("%s: add new entry for cmd:%d, addr:%pISc in wq:%p\n", __func__, addrw->state, &addrw->a.sa, &net->sctp.addr_waitq); if (!timer_pending(&net->sctp.addr_wq_timer)) { timeo_val = jiffies; timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY); mod_timer(&net->sctp.addr_wq_timer, timeo_val); } spin_unlock_bh(&net->sctp.addr_wq_lock); } /* Event handler for inet address addition/deletion events. * The sctp_local_addr_list needs to be protocted by a spin lock since * multiple notifiers (say IPv4 and IPv6) may be running at the same * time and thus corrupt the list. * The reader side is protected with RCU. */ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sctp_sockaddr_entry *addr = NULL; struct sctp_sockaddr_entry *temp; struct net *net = dev_net(ifa->ifa_dev->dev); int found = 0; switch (ev) { case NETDEV_UP: addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v4.sin_family = AF_INET; addr->a.v4.sin_addr.s_addr = ifa->ifa_local; addr->valid = 1; spin_lock_bh(&net->sctp.local_addr_lock); list_add_tail_rcu(&addr->list, &net->sctp.local_addr_list); sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_NEW); spin_unlock_bh(&net->sctp.local_addr_lock); } break; case NETDEV_DOWN: spin_lock_bh(&net->sctp.local_addr_lock); list_for_each_entry_safe(addr, temp, &net->sctp.local_addr_list, list) { if (addr->a.sa.sa_family == AF_INET && addr->a.v4.sin_addr.s_addr == ifa->ifa_local) { sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL); found = 1; addr->valid = 0; list_del_rcu(&addr->list); break; } } spin_unlock_bh(&net->sctp.local_addr_lock); if (found) kfree_rcu(addr, rcu); break; } return NOTIFY_DONE; } /* * Initialize the control inode/socket with a control endpoint data * structure. This endpoint is reserved exclusively for the OOTB processing. */ static int sctp_ctl_sock_init(struct net *net) { int err; sa_family_t family = PF_INET; if (sctp_get_pf_specific(PF_INET6)) family = PF_INET6; err = inet_ctl_sock_create(&net->sctp.ctl_sock, family, SOCK_SEQPACKET, IPPROTO_SCTP, net); /* If IPv6 socket could not be created, try the IPv4 socket */ if (err < 0 && family == PF_INET6) err = inet_ctl_sock_create(&net->sctp.ctl_sock, AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP, net); if (err < 0) { pr_err("Failed to create the SCTP control socket\n"); return err; } return 0; } static int sctp_udp_rcv(struct sock *sk, struct sk_buff *skb) { SCTP_INPUT_CB(skb)->encap_port = udp_hdr(skb)->source; skb_set_transport_header(skb, sizeof(struct udphdr)); sctp_rcv(skb); return 0; } int sctp_udp_sock_start(struct net *net) { struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct udp_port_cfg udp_conf = {0}; struct socket *sock; int err; udp_conf.family = AF_INET; udp_conf.local_ip.s_addr = htonl(INADDR_ANY); udp_conf.local_udp_port = htons(net->sctp.udp_port); err = udp_sock_create(net, &udp_conf, &sock); if (err) { pr_err("Failed to create the SCTP UDP tunneling v4 sock\n"); return err; } tuncfg.encap_type = 1; tuncfg.encap_rcv = sctp_udp_rcv; tuncfg.encap_err_lookup = sctp_udp_v4_err; setup_udp_tunnel_sock(net, sock, &tuncfg); net->sctp.udp4_sock = sock->sk; #if IS_ENABLED(CONFIG_IPV6) memset(&udp_conf, 0, sizeof(udp_conf)); udp_conf.family = AF_INET6; udp_conf.local_ip6 = in6addr_any; udp_conf.local_udp_port = htons(net->sctp.udp_port); udp_conf.use_udp6_rx_checksums = true; udp_conf.ipv6_v6only = true; err = udp_sock_create(net, &udp_conf, &sock); if (err) { pr_err("Failed to create the SCTP UDP tunneling v6 sock\n"); udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket); net->sctp.udp4_sock = NULL; return err; } tuncfg.encap_type = 1; tuncfg.encap_rcv = sctp_udp_rcv; tuncfg.encap_err_lookup = sctp_udp_v6_err; setup_udp_tunnel_sock(net, sock, &tuncfg); net->sctp.udp6_sock = sock->sk; #endif return 0; } void sctp_udp_sock_stop(struct net *net) { if (net->sctp.udp4_sock) { udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket); net->sctp.udp4_sock = NULL; } if (net->sctp.udp6_sock) { udp_tunnel_sock_release(net->sctp.udp6_sock->sk_socket); net->sctp.udp6_sock = NULL; } } /* Register address family specific functions. */ int sctp_register_af(struct sctp_af *af) { switch (af->sa_family) { case AF_INET: if (sctp_af_v4_specific) return 0; sctp_af_v4_specific = af; break; case AF_INET6: if (sctp_af_v6_specific) return 0; sctp_af_v6_specific = af; break; default: return 0; } INIT_LIST_HEAD(&af->list); list_add_tail(&af->list, &sctp_address_families); return 1; } /* Get the table of functions for manipulating a particular address * family. */ struct sctp_af *sctp_get_af_specific(sa_family_t family) { switch (family) { case AF_INET: return sctp_af_v4_specific; case AF_INET6: return sctp_af_v6_specific; default: return NULL; } } /* Common code to initialize a AF_INET msg_name. */ static void sctp_inet_msgname(char *msgname, int *addr_len) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)msgname; *addr_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); } /* Copy the primary address of the peer primary address as the msg_name. */ static void sctp_inet_event_msgname(struct sctp_ulpevent *event, char *msgname, int *addr_len) { struct sockaddr_in *sin, *sinfrom; if (msgname) { struct sctp_association *asoc; asoc = event->asoc; sctp_inet_msgname(msgname, addr_len); sin = (struct sockaddr_in *)msgname; sinfrom = &asoc->peer.primary_addr.v4; sin->sin_port = htons(asoc->peer.port); sin->sin_addr.s_addr = sinfrom->sin_addr.s_addr; } } /* Initialize and copy out a msgname from an inbound skb. */ static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len) { if (msgname) { struct sctphdr *sh = sctp_hdr(skb); struct sockaddr_in *sin = (struct sockaddr_in *)msgname; sctp_inet_msgname(msgname, len); sin->sin_port = sh->source; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; } } /* Do we support this AF? */ static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp) { /* PF_INET only supports AF_INET addresses. */ return AF_INET == family; } /* Address matching with wildcards allowed. */ static int sctp_inet_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2, struct sctp_sock *opt) { /* PF_INET only supports AF_INET addresses. */ if (addr1->sa.sa_family != addr2->sa.sa_family) return 0; if (htonl(INADDR_ANY) == addr1->v4.sin_addr.s_addr || htonl(INADDR_ANY) == addr2->v4.sin_addr.s_addr) return 1; if (addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr) return 1; return 0; } /* Verify that provided sockaddr looks bindable. Common verification has * already been taken care of. */ static int sctp_inet_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) { return sctp_v4_available(addr, opt); } /* Verify that sockaddr looks sendable. Common verification has already * been taken care of. */ static int sctp_inet_send_verify(struct sctp_sock *opt, union sctp_addr *addr) { return 1; } /* Fill in Supported Address Type information for INIT and INIT-ACK * chunks. Returns number of addresses supported. */ static int sctp_inet_supported_addrs(const struct sctp_sock *opt, __be16 *types) { types[0] = SCTP_PARAM_IPV4_ADDRESS; return 1; } /* Wrapper routine that calls the ip transmit routine. */ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) { struct dst_entry *dst = dst_clone(t->dst); struct flowi4 *fl4 = &t->fl.u.ip4; struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); __u8 dscp = inet->tos; __be16 df = 0; pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb, skb->len, &fl4->saddr, &fl4->daddr); if (t->dscp & SCTP_DSCP_SET_MASK) dscp = t->dscp & SCTP_DSCP_VAL_MASK; inet->pmtudisc = t->param_flags & SPP_PMTUD_ENABLE ? IP_PMTUDISC_DO : IP_PMTUDISC_DONT; SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); if (!t->encap_port || !sctp_sk(sk)->udp_port) { skb_dst_set(skb, dst); return __ip_queue_xmit(sk, skb, &t->fl, dscp); } if (skb_is_gso(skb)) skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; if (ip_dont_fragment(sk, dst) && !skb->ignore_df) df = htons(IP_DF); skb->encapsulation = 1; skb_reset_inner_mac_header(skb); skb_reset_inner_transport_header(skb); skb_set_inner_ipproto(skb, IPPROTO_SCTP); udp_tunnel_xmit_skb((struct rtable *)dst, sk, skb, fl4->saddr, fl4->daddr, dscp, ip4_dst_hoplimit(dst), df, sctp_sk(sk)->udp_port, t->encap_port, false, false); return 0; } static struct sctp_af sctp_af_inet; static struct sctp_pf sctp_pf_inet = { .event_msgname = sctp_inet_event_msgname, .skb_msgname = sctp_inet_skb_msgname, .af_supported = sctp_inet_af_supported, .cmp_addr = sctp_inet_cmp_addr, .bind_verify = sctp_inet_bind_verify, .send_verify = sctp_inet_send_verify, .supported_addrs = sctp_inet_supported_addrs, .create_accept_sk = sctp_v4_create_accept_sk, .addr_to_user = sctp_v4_addr_to_user, .to_sk_saddr = sctp_v4_to_sk_saddr, .to_sk_daddr = sctp_v4_to_sk_daddr, .copy_ip_options = sctp_v4_copy_ip_options, .af = &sctp_af_inet }; /* Notifier for inetaddr addition/deletion events. */ static struct notifier_block sctp_inetaddr_notifier = { .notifier_call = sctp_inetaddr_event, }; /* Socket operations. */ static const struct proto_ops inet_seqpacket_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, /* Needs to be wrapped... */ .bind = inet_bind, .connect = sctp_inet_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, /* Semantics are different. */ .poll = sctp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sctp_inet_listen, .shutdown = inet_shutdown, /* Looks harmless. */ .setsockopt = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem */ .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, }; /* Registration with AF_INET family. */ static struct inet_protosw sctp_seqpacket_protosw = { .type = SOCK_SEQPACKET, .protocol = IPPROTO_SCTP, .prot = &sctp_prot, .ops = &inet_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG }; static struct inet_protosw sctp_stream_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SCTP, .prot = &sctp_prot, .ops = &inet_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG }; static int sctp4_rcv(struct sk_buff *skb) { SCTP_INPUT_CB(skb)->encap_port = 0; return sctp_rcv(skb); } /* Register with IP layer. */ static const struct net_protocol sctp_protocol = { .handler = sctp4_rcv, .err_handler = sctp_v4_err, .no_policy = 1, .icmp_strict_tag_validation = 1, }; /* IPv4 address related functions. */ static struct sctp_af sctp_af_inet = { .sa_family = AF_INET, .sctp_xmit = sctp_v4_xmit, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .get_dst = sctp_v4_get_dst, .get_saddr = sctp_v4_get_saddr, .copy_addrlist = sctp_v4_copy_addrlist, .from_skb = sctp_v4_from_skb, .from_sk = sctp_v4_from_sk, .from_addr_param = sctp_v4_from_addr_param, .to_addr_param = sctp_v4_to_addr_param, .cmp_addr = sctp_v4_cmp_addr, .addr_valid = sctp_v4_addr_valid, .inaddr_any = sctp_v4_inaddr_any, .is_any = sctp_v4_is_any, .available = sctp_v4_available, .scope = sctp_v4_scope, .skb_iif = sctp_v4_skb_iif, .skb_sdif = sctp_v4_skb_sdif, .is_ce = sctp_v4_is_ce, .seq_dump_addr = sctp_v4_seq_dump_addr, .ecn_capable = sctp_v4_ecn_capable, .net_header_len = sizeof(struct iphdr), .sockaddr_len = sizeof(struct sockaddr_in), .ip_options_len = sctp_v4_ip_options_len, }; struct sctp_pf *sctp_get_pf_specific(sa_family_t family) { switch (family) { case PF_INET: return sctp_pf_inet_specific; case PF_INET6: return sctp_pf_inet6_specific; default: return NULL; } } /* Register the PF specific function table. */ int sctp_register_pf(struct sctp_pf *pf, sa_family_t family) { switch (family) { case PF_INET: if (sctp_pf_inet_specific) return 0; sctp_pf_inet_specific = pf; break; case PF_INET6: if (sctp_pf_inet6_specific) return 0; sctp_pf_inet6_specific = pf; break; default: return 0; } return 1; } static inline int init_sctp_mibs(struct net *net) { net->sctp.sctp_statistics = alloc_percpu(struct sctp_mib); if (!net->sctp.sctp_statistics) return -ENOMEM; return 0; } static inline void cleanup_sctp_mibs(struct net *net) { free_percpu(net->sctp.sctp_statistics); } static void sctp_v4_pf_init(void) { /* Initialize the SCTP specific PF functions. */ sctp_register_pf(&sctp_pf_inet, PF_INET); sctp_register_af(&sctp_af_inet); } static void sctp_v4_pf_exit(void) { list_del(&sctp_af_inet.list); } static int sctp_v4_protosw_init(void) { int rc; rc = proto_register(&sctp_prot, 1); if (rc) return rc; /* Register SCTP(UDP and TCP style) with socket layer. */ inet_register_protosw(&sctp_seqpacket_protosw); inet_register_protosw(&sctp_stream_protosw); return 0; } static void sctp_v4_protosw_exit(void) { inet_unregister_protosw(&sctp_stream_protosw); inet_unregister_protosw(&sctp_seqpacket_protosw); proto_unregister(&sctp_prot); } static int sctp_v4_add_protocol(void) { /* Register notifier for inet address additions/deletions. */ register_inetaddr_notifier(&sctp_inetaddr_notifier); /* Register SCTP with inet layer. */ if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0) return -EAGAIN; return 0; } static void sctp_v4_del_protocol(void) { inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); unregister_inetaddr_notifier(&sctp_inetaddr_notifier); } static int __net_init sctp_defaults_init(struct net *net) { int status; /* * 14. Suggested SCTP Protocol Parameter Values */ /* The following protocol parameters are RECOMMENDED: */ /* RTO.Initial - 3 seconds */ net->sctp.rto_initial = SCTP_RTO_INITIAL; /* RTO.Min - 1 second */ net->sctp.rto_min = SCTP_RTO_MIN; /* RTO.Max - 60 seconds */ net->sctp.rto_max = SCTP_RTO_MAX; /* RTO.Alpha - 1/8 */ net->sctp.rto_alpha = SCTP_RTO_ALPHA; /* RTO.Beta - 1/4 */ net->sctp.rto_beta = SCTP_RTO_BETA; /* Valid.Cookie.Life - 60 seconds */ net->sctp.valid_cookie_life = SCTP_DEFAULT_COOKIE_LIFE; /* Whether Cookie Preservative is enabled(1) or not(0) */ net->sctp.cookie_preserve_enable = 1; /* Default sctp sockets to use md5 as their hmac alg */ #if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5) net->sctp.sctp_hmac_alg = "md5"; #elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1) net->sctp.sctp_hmac_alg = "sha1"; #else net->sctp.sctp_hmac_alg = NULL; #endif /* Max.Burst - 4 */ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST; /* Disable of Primary Path Switchover by default */ net->sctp.ps_retrans = SCTP_PS_RETRANS_MAX; /* Enable pf state by default */ net->sctp.pf_enable = 1; /* Ignore pf exposure feature by default */ net->sctp.pf_expose = SCTP_PF_EXPOSE_UNSET; /* Association.Max.Retrans - 10 attempts * Path.Max.Retrans - 5 attempts (per destination address) * Max.Init.Retransmits - 8 attempts */ net->sctp.max_retrans_association = 10; net->sctp.max_retrans_path = 5; net->sctp.max_retrans_init = 8; /* Sendbuffer growth - do per-socket accounting */ net->sctp.sndbuf_policy = 0; /* Rcvbuffer growth - do per-socket accounting */ net->sctp.rcvbuf_policy = 0; /* HB.interval - 30 seconds */ net->sctp.hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; /* delayed SACK timeout */ net->sctp.sack_timeout = SCTP_DEFAULT_TIMEOUT_SACK; /* Disable ADDIP by default. */ net->sctp.addip_enable = 0; net->sctp.addip_noauth = 0; net->sctp.default_auto_asconf = 0; /* Enable PR-SCTP by default. */ net->sctp.prsctp_enable = 1; /* Disable RECONF by default. */ net->sctp.reconf_enable = 0; /* Disable AUTH by default. */ net->sctp.auth_enable = 0; /* Enable ECN by default. */ net->sctp.ecn_enable = 1; /* Set UDP tunneling listening port to 0 by default */ net->sctp.udp_port = 0; /* Set remote encap port to 0 by default */ net->sctp.encap_port = 0; /* Set SCOPE policy to enabled */ net->sctp.scope_policy = SCTP_SCOPE_POLICY_ENABLE; /* Set the default rwnd update threshold */ net->sctp.rwnd_upd_shift = SCTP_DEFAULT_RWND_SHIFT; /* Initialize maximum autoclose timeout. */ net->sctp.max_autoclose = INT_MAX / HZ; #ifdef CONFIG_NET_L3_MASTER_DEV net->sctp.l3mdev_accept = 1; #endif status = sctp_sysctl_net_register(net); if (status) goto err_sysctl_register; /* Allocate and initialise sctp mibs. */ status = init_sctp_mibs(net); if (status) goto err_init_mibs; #ifdef CONFIG_PROC_FS /* Initialize proc fs directory. */ status = sctp_proc_init(net); if (status) goto err_init_proc; #endif sctp_dbg_objcnt_init(net); /* Initialize the local address list. */ INIT_LIST_HEAD(&net->sctp.local_addr_list); spin_lock_init(&net->sctp.local_addr_lock); sctp_get_local_addr_list(net); /* Initialize the address event list */ INIT_LIST_HEAD(&net->sctp.addr_waitq); INIT_LIST_HEAD(&net->sctp.auto_asconf_splist); spin_lock_init(&net->sctp.addr_wq_lock); net->sctp.addr_wq_timer.expires = 0; timer_setup(&net->sctp.addr_wq_timer, sctp_addr_wq_timeout_handler, 0); return 0; #ifdef CONFIG_PROC_FS err_init_proc: cleanup_sctp_mibs(net); #endif err_init_mibs: sctp_sysctl_net_unregister(net); err_sysctl_register: return status; } static void __net_exit sctp_defaults_exit(struct net *net) { /* Free the local address list */ sctp_free_addr_wq(net); sctp_free_local_addr_list(net); #ifdef CONFIG_PROC_FS remove_proc_subtree("sctp", net->proc_net); net->sctp.proc_net_sctp = NULL; #endif cleanup_sctp_mibs(net); sctp_sysctl_net_unregister(net); } static struct pernet_operations sctp_defaults_ops = { .init = sctp_defaults_init, .exit = sctp_defaults_exit, }; static int __net_init sctp_ctrlsock_init(struct net *net) { int status; /* Initialize the control inode/socket for handling OOTB packets. */ status = sctp_ctl_sock_init(net); if (status) pr_err("Failed to initialize the SCTP control sock\n"); return status; } static void __net_exit sctp_ctrlsock_exit(struct net *net) { /* Free the control endpoint. */ inet_ctl_sock_destroy(net->sctp.ctl_sock); } static struct pernet_operations sctp_ctrlsock_ops = { .init = sctp_ctrlsock_init, .exit = sctp_ctrlsock_exit, }; /* Initialize the universe into something sensible. */ static __init int sctp_init(void) { unsigned long nr_pages = totalram_pages(); unsigned long limit; unsigned long goal; int max_entry_order; int num_entries; int max_share; int status; int order; int i; sock_skb_cb_check_size(sizeof(struct sctp_ulpevent)); /* Allocate bind_bucket and chunk caches. */ status = -ENOBUFS; sctp_bucket_cachep = kmem_cache_create("sctp_bind_bucket", sizeof(struct sctp_bind_bucket), 0, SLAB_HWCACHE_ALIGN, NULL); if (!sctp_bucket_cachep) goto out; sctp_chunk_cachep = kmem_cache_create("sctp_chunk", sizeof(struct sctp_chunk), 0, SLAB_HWCACHE_ALIGN, NULL); if (!sctp_chunk_cachep) goto err_chunk_cachep; status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL); if (status) goto err_percpu_counter_init; /* Implementation specific variables. */ /* Initialize default stream count setup information. */ sctp_max_instreams = SCTP_DEFAULT_INSTREAMS; sctp_max_outstreams = SCTP_DEFAULT_OUTSTREAMS; /* Initialize handle used for association ids. */ idr_init(&sctp_assocs_id); limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); sysctl_sctp_mem[0] = limit / 4 * 3; sysctl_sctp_mem[1] = limit; sysctl_sctp_mem[2] = sysctl_sctp_mem[0] * 2; /* Set per-socket limits to no more than 1/128 the pressure threshold*/ limit = (sysctl_sctp_mem[1]) << (PAGE_SHIFT - 7); max_share = min(4UL*1024*1024, limit); sysctl_sctp_rmem[0] = PAGE_SIZE; /* give each asoc 1 page min */ sysctl_sctp_rmem[1] = 1500 * SKB_TRUESIZE(1); sysctl_sctp_rmem[2] = max(sysctl_sctp_rmem[1], max_share); sysctl_sctp_wmem[0] = PAGE_SIZE; sysctl_sctp_wmem[1] = 16*1024; sysctl_sctp_wmem[2] = max(64*1024, max_share); /* Size and allocate the association hash table. * The methodology is similar to that of the tcp hash tables. * Though not identical. Start by getting a goal size */ if (nr_pages >= (128 * 1024)) goal = nr_pages >> (22 - PAGE_SHIFT); else goal = nr_pages >> (24 - PAGE_SHIFT); /* Then compute the page order for said goal */ order = get_order(goal); /* Now compute the required page order for the maximum sized table we * want to create */ max_entry_order = get_order(MAX_SCTP_PORT_HASH_ENTRIES * sizeof(struct sctp_bind_hashbucket)); /* Limit the page order by that maximum hash table size */ order = min(order, max_entry_order); /* Allocate and initialize the endpoint hash table. */ sctp_ep_hashsize = 64; sctp_ep_hashtable = kmalloc_array(64, sizeof(struct sctp_hashbucket), GFP_KERNEL); if (!sctp_ep_hashtable) { pr_err("Failed endpoint_hash alloc\n"); status = -ENOMEM; goto err_ehash_alloc; } for (i = 0; i < sctp_ep_hashsize; i++) { rwlock_init(&sctp_ep_hashtable[i].lock); INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain); } /* Allocate and initialize the SCTP port hash table. * Note that order is initalized to start at the max sized * table we want to support. If we can't get that many pages * reduce the order and try again */ do { sctp_port_hashtable = (struct sctp_bind_hashbucket *) __get_free_pages(GFP_KERNEL | __GFP_NOWARN, order); } while (!sctp_port_hashtable && --order > 0); if (!sctp_port_hashtable) { pr_err("Failed bind hash alloc\n"); status = -ENOMEM; goto err_bhash_alloc; } /* Now compute the number of entries that will fit in the * port hash space we allocated */ num_entries = (1UL << order) * PAGE_SIZE / sizeof(struct sctp_bind_hashbucket); /* And finish by rounding it down to the nearest power of two. * This wastes some memory of course, but it's needed because * the hash function operates based on the assumption that * the number of entries is a power of two. */ sctp_port_hashsize = rounddown_pow_of_two(num_entries); for (i = 0; i < sctp_port_hashsize; i++) { spin_lock_init(&sctp_port_hashtable[i].lock); INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); } status = sctp_transport_hashtable_init(); if (status) goto err_thash_alloc; pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize, num_entries); sctp_sysctl_register(); INIT_LIST_HEAD(&sctp_address_families); sctp_v4_pf_init(); sctp_v6_pf_init(); sctp_sched_ops_init(); status = register_pernet_subsys(&sctp_defaults_ops); if (status) goto err_register_defaults; status = sctp_v4_protosw_init(); if (status) goto err_protosw_init; status = sctp_v6_protosw_init(); if (status) goto err_v6_protosw_init; status = register_pernet_subsys(&sctp_ctrlsock_ops); if (status) goto err_register_ctrlsock; status = sctp_v4_add_protocol(); if (status) goto err_add_protocol; /* Register SCTP with inet6 layer. */ status = sctp_v6_add_protocol(); if (status) goto err_v6_add_protocol; if (sctp_offload_init() < 0) pr_crit("%s: Cannot add SCTP protocol offload\n", __func__); out: return status; err_v6_add_protocol: sctp_v4_del_protocol(); err_add_protocol: unregister_pernet_subsys(&sctp_ctrlsock_ops); err_register_ctrlsock: sctp_v6_protosw_exit(); err_v6_protosw_init: sctp_v4_protosw_exit(); err_protosw_init: unregister_pernet_subsys(&sctp_defaults_ops); err_register_defaults: sctp_v4_pf_exit(); sctp_v6_pf_exit(); sctp_sysctl_unregister(); free_pages((unsigned long)sctp_port_hashtable, get_order(sctp_port_hashsize * sizeof(struct sctp_bind_hashbucket))); err_bhash_alloc: sctp_transport_hashtable_destroy(); err_thash_alloc: kfree(sctp_ep_hashtable); err_ehash_alloc: percpu_counter_destroy(&sctp_sockets_allocated); err_percpu_counter_init: kmem_cache_destroy(sctp_chunk_cachep); err_chunk_cachep: kmem_cache_destroy(sctp_bucket_cachep); goto out; } /* Exit handler for the SCTP protocol. */ static __exit void sctp_exit(void) { /* BUG. This should probably do something useful like clean * up all the remaining associations and all that memory. */ /* Unregister with inet6/inet layers. */ sctp_v6_del_protocol(); sctp_v4_del_protocol(); unregister_pernet_subsys(&sctp_ctrlsock_ops); /* Free protosw registrations */ sctp_v6_protosw_exit(); sctp_v4_protosw_exit(); unregister_pernet_subsys(&sctp_defaults_ops); /* Unregister with socket layer. */ sctp_v6_pf_exit(); sctp_v4_pf_exit(); sctp_sysctl_unregister(); free_pages((unsigned long)sctp_port_hashtable, get_order(sctp_port_hashsize * sizeof(struct sctp_bind_hashbucket))); kfree(sctp_ep_hashtable); sctp_transport_hashtable_destroy(); percpu_counter_destroy(&sctp_sockets_allocated); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(sctp_chunk_cachep); kmem_cache_destroy(sctp_bucket_cachep); } module_init(sctp_init); module_exit(sctp_exit); /* * __stringify doesn't likes enums, so use IPPROTO_SCTP value (132) directly. */ MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132"); MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132"); MODULE_AUTHOR("Linux Kernel SCTP developers <linux-sctp@vger.kernel.org>"); MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); module_param_named(no_checksums, sctp_checksum_disable, bool, 0644); MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification"); MODULE_LICENSE("GPL"); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | #ifndef _UAPI_LINUX_VIRTIO_RING_H #define _UAPI_LINUX_VIRTIO_RING_H /* An interface for efficient virtio implementation, currently for use by KVM, * but hopefully others soon. Do NOT change this since it will * break existing servers and clients. * * This header is BSD licensed so anyone can use the definitions to implement * compatible drivers/servers. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of IBM nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright Rusty Russell IBM Corporation 2007. */ #ifndef __KERNEL__ #include <stdint.h> #endif #include <linux/types.h> #include <linux/virtio_types.h> /* This marks a buffer as continuing via the next field. */ #define VRING_DESC_F_NEXT 1 /* This marks a buffer as write-only (otherwise read-only). */ #define VRING_DESC_F_WRITE 2 /* This means the buffer contains a list of buffer descriptors. */ #define VRING_DESC_F_INDIRECT 4 /* * Mark a descriptor as available or used in packed ring. * Notice: they are defined as shifts instead of shifted values. */ #define VRING_PACKED_DESC_F_AVAIL 7 #define VRING_PACKED_DESC_F_USED 15 /* The Host uses this in used->flags to advise the Guest: don't kick me when * you add a buffer. It's unreliable, so it's simply an optimization. Guest * will still kick if it's out of buffers. */ #define VRING_USED_F_NO_NOTIFY 1 /* The Guest uses this in avail->flags to advise the Host: don't interrupt me * when you consume a buffer. It's unreliable, so it's simply an * optimization. */ #define VRING_AVAIL_F_NO_INTERRUPT 1 /* Enable events in packed ring. */ #define VRING_PACKED_EVENT_FLAG_ENABLE 0x0 /* Disable events in packed ring. */ #define VRING_PACKED_EVENT_FLAG_DISABLE 0x1 /* * Enable events for a specific descriptor in packed ring. * (as specified by Descriptor Ring Change Event Offset/Wrap Counter). * Only valid if VIRTIO_RING_F_EVENT_IDX has been negotiated. */ #define VRING_PACKED_EVENT_FLAG_DESC 0x2 /* * Wrap counter bit shift in event suppression structure * of packed ring. */ #define VRING_PACKED_EVENT_F_WRAP_CTR 15 /* We support indirect buffer descriptors */ #define VIRTIO_RING_F_INDIRECT_DESC 28 /* The Guest publishes the used index for which it expects an interrupt * at the end of the avail ring. Host should ignore the avail->flags field. */ /* The Host publishes the avail index for which it expects a kick * at the end of the used ring. Guest should ignore the used->flags field. */ #define VIRTIO_RING_F_EVENT_IDX 29 /* Alignment requirements for vring elements. * When using pre-virtio 1.0 layout, these fall out naturally. */ #define VRING_AVAIL_ALIGN_SIZE 2 #define VRING_USED_ALIGN_SIZE 4 #define VRING_DESC_ALIGN_SIZE 16 /** * struct vring_desc - Virtio ring descriptors, * 16 bytes long. These can chain together via @next. * * @addr: buffer address (guest-physical) * @len: buffer length * @flags: descriptor flags * @next: index of the next descriptor in the chain, * if the VRING_DESC_F_NEXT flag is set. We chain unused * descriptors via this, too. */ struct vring_desc { __virtio64 addr; __virtio32 len; __virtio16 flags; __virtio16 next; }; struct vring_avail { __virtio16 flags; __virtio16 idx; __virtio16 ring[]; }; /* u32 is used here for ids for padding reasons. */ struct vring_used_elem { /* Index of start of used descriptor chain. */ __virtio32 id; /* Total length of the descriptor chain which was used (written to) */ __virtio32 len; }; typedef struct vring_used_elem __attribute__((aligned(VRING_USED_ALIGN_SIZE))) vring_used_elem_t; struct vring_used { __virtio16 flags; __virtio16 idx; vring_used_elem_t ring[]; }; /* * The ring element addresses are passed between components with different * alignments assumptions. Thus, we might need to decrease the compiler-selected * alignment, and so must use a typedef to make sure the aligned attribute * actually takes hold: * * https://gcc.gnu.org/onlinedocs//gcc/Common-Type-Attributes.html#Common-Type-Attributes * * When used on a struct, or struct member, the aligned attribute can only * increase the alignment; in order to decrease it, the packed attribute must * be specified as well. When used as part of a typedef, the aligned attribute * can both increase and decrease alignment, and specifying the packed * attribute generates a warning. */ typedef struct vring_desc __attribute__((aligned(VRING_DESC_ALIGN_SIZE))) vring_desc_t; typedef struct vring_avail __attribute__((aligned(VRING_AVAIL_ALIGN_SIZE))) vring_avail_t; typedef struct vring_used __attribute__((aligned(VRING_USED_ALIGN_SIZE))) vring_used_t; struct vring { unsigned int num; vring_desc_t *desc; vring_avail_t *avail; vring_used_t *used; }; #ifndef VIRTIO_RING_NO_LEGACY /* The standard layout for the ring is a continuous chunk of memory which looks * like this. We assume num is a power of 2. * * struct vring * { * // The actual descriptors (16 bytes each) * struct vring_desc desc[num]; * * // A ring of available descriptor heads with free-running index. * __virtio16 avail_flags; * __virtio16 avail_idx; * __virtio16 available[num]; * __virtio16 used_event_idx; * * // Padding to the next align boundary. * char pad[]; * * // A ring of used descriptor heads with free-running index. * __virtio16 used_flags; * __virtio16 used_idx; * struct vring_used_elem used[num]; * __virtio16 avail_event_idx; * }; */ /* We publish the used event index at the end of the available ring, and vice * versa. They are at the end for backwards compatibility. */ #define vring_used_event(vr) ((vr)->avail->ring[(vr)->num]) #define vring_avail_event(vr) (*(__virtio16 *)&(vr)->used->ring[(vr)->num]) static inline void vring_init(struct vring *vr, unsigned int num, void *p, unsigned long align) { vr->num = num; vr->desc = p; vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc)); vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16) + align-1) & ~(align - 1)); } static inline unsigned vring_size(unsigned int num, unsigned long align) { return ((sizeof(struct vring_desc) * num + sizeof(__virtio16) * (3 + num) + align - 1) & ~(align - 1)) + sizeof(__virtio16) * 3 + sizeof(struct vring_used_elem) * num; } #endif /* VIRTIO_RING_NO_LEGACY */ /* The following is used with USED_EVENT_IDX and AVAIL_EVENT_IDX */ /* Assuming a given event_idx value from the other side, if * we have just incremented index from old to new_idx, * should we trigger an event? */ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old) { /* Note: Xen has similar logic for notification hold-off * in include/xen/interface/io/ring.h with req_event and req_prod * corresponding to event_idx + 1 and new_idx respectively. * Note also that req_event and req_prod in Xen start at 1, * event indexes in virtio start at 0. */ return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old); } struct vring_packed_desc_event { /* Descriptor Ring Change Event Offset/Wrap Counter. */ __le16 off_wrap; /* Descriptor Ring Change Event Flags. */ __le16 flags; }; struct vring_packed_desc { /* Buffer Address. */ __le64 addr; /* Buffer Length. */ __le32 len; /* Buffer ID. */ __le16 id; /* The flags depending on descriptor type. */ __le16 flags; }; #endif /* _UAPI_LINUX_VIRTIO_RING_H */ |
1311 1072 1050 1072 1311 1290 1291 505 1291 1291 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * sha256_base.h - core logic for SHA-256 implementations * * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> */ #ifndef _CRYPTO_SHA256_BASE_H #define _CRYPTO_SHA256_BASE_H #include <asm/byteorder.h> #include <asm/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/sha2.h> #include <linux/string.h> #include <linux/types.h> typedef void (sha256_block_fn)(struct sha256_state *sst, u8 const *src, int blocks); static inline int sha224_base_init(struct shash_desc *desc) { struct sha256_state *sctx = shash_desc_ctx(desc); sha224_init(sctx); return 0; } static inline int sha256_base_init(struct shash_desc *desc) { struct sha256_state *sctx = shash_desc_ctx(desc); sha256_init(sctx); return 0; } static inline int lib_sha256_base_do_update(struct sha256_state *sctx, const u8 *data, unsigned int len, sha256_block_fn *block_fn) { unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; sctx->count += len; if (unlikely((partial + len) >= SHA256_BLOCK_SIZE)) { int blocks; if (partial) { int p = SHA256_BLOCK_SIZE - partial; memcpy(sctx->buf + partial, data, p); data += p; len -= p; block_fn(sctx, sctx->buf, 1); } blocks = len / SHA256_BLOCK_SIZE; len %= SHA256_BLOCK_SIZE; if (blocks) { block_fn(sctx, data, blocks); data += blocks * SHA256_BLOCK_SIZE; } partial = 0; } if (len) memcpy(sctx->buf + partial, data, len); return 0; } static inline int sha256_base_do_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha256_block_fn *block_fn) { struct sha256_state *sctx = shash_desc_ctx(desc); return lib_sha256_base_do_update(sctx, data, len, block_fn); } static inline int lib_sha256_base_do_finalize(struct sha256_state *sctx, sha256_block_fn *block_fn) { const int bit_offset = SHA256_BLOCK_SIZE - sizeof(__be64); __be64 *bits = (__be64 *)(sctx->buf + bit_offset); unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; sctx->buf[partial++] = 0x80; if (partial > bit_offset) { memset(sctx->buf + partial, 0x0, SHA256_BLOCK_SIZE - partial); partial = 0; block_fn(sctx, sctx->buf, 1); } memset(sctx->buf + partial, 0x0, bit_offset - partial); *bits = cpu_to_be64(sctx->count << 3); block_fn(sctx, sctx->buf, 1); return 0; } static inline int sha256_base_do_finalize(struct shash_desc *desc, sha256_block_fn *block_fn) { struct sha256_state *sctx = shash_desc_ctx(desc); return lib_sha256_base_do_finalize(sctx, block_fn); } static inline int lib_sha256_base_finish(struct sha256_state *sctx, u8 *out, unsigned int digest_size) { __be32 *digest = (__be32 *)out; int i; for (i = 0; digest_size > 0; i++, digest_size -= sizeof(__be32)) put_unaligned_be32(sctx->state[i], digest++); memzero_explicit(sctx, sizeof(*sctx)); return 0; } static inline int sha256_base_finish(struct shash_desc *desc, u8 *out) { unsigned int digest_size = crypto_shash_digestsize(desc->tfm); struct sha256_state *sctx = shash_desc_ctx(desc); return lib_sha256_base_finish(sctx, out, digest_size); } #endif /* _CRYPTO_SHA256_BASE_H */ |
31 25 6 252 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM mmap_lock #if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MMAP_LOCK_H #include <linux/tracepoint.h> #include <linux/types.h> struct mm_struct; extern int trace_mmap_lock_reg(void); extern void trace_mmap_lock_unreg(void); DECLARE_EVENT_CLASS(mmap_lock, TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), TP_ARGS(mm, memcg_path, write), TP_STRUCT__entry( __field(struct mm_struct *, mm) __string(memcg_path, memcg_path) __field(bool, write) ), TP_fast_assign( __entry->mm = mm; __assign_str(memcg_path, memcg_path); __entry->write = write; ), TP_printk( "mm=%p memcg_path=%s write=%s", __entry->mm, __get_str(memcg_path), __entry->write ? "true" : "false" ) ); #define DEFINE_MMAP_LOCK_EVENT(name) \ DEFINE_EVENT_FN(mmap_lock, name, \ TP_PROTO(struct mm_struct *mm, const char *memcg_path, \ bool write), \ TP_ARGS(mm, memcg_path, write), \ trace_mmap_lock_reg, trace_mmap_lock_unreg) DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking); DEFINE_MMAP_LOCK_EVENT(mmap_lock_released); TRACE_EVENT_FN(mmap_lock_acquire_returned, TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, bool success), TP_ARGS(mm, memcg_path, write, success), TP_STRUCT__entry( __field(struct mm_struct *, mm) __string(memcg_path, memcg_path) __field(bool, write) __field(bool, success) ), TP_fast_assign( __entry->mm = mm; __assign_str(memcg_path, memcg_path); __entry->write = write; __entry->success = success; ), TP_printk( "mm=%p memcg_path=%s write=%s success=%s", __entry->mm, __get_str(memcg_path), __entry->write ? "true" : "false", __entry->success ? "true" : "false" ), trace_mmap_lock_reg, trace_mmap_lock_unreg ); #endif /* _TRACE_MMAP_LOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
1021 40 980 548 40 506 1017 488 1019 100 100 100 100 100 100 100 100 100 100 30 58 58 58 58 100 100 100 100 100 6 8 8 58 34 34 100 8 100 57 100 100 100 96 75 74 884 884 883 883 882 884 379 379 33 489 170 3 488 34 34 1 33 2 1 1 1 1 486 486 487 486 485 488 1032 1032 1030 1029 1029 2 30 40 39 38 36 35 33 31 2 30 31 3 30 3 2 1 2 30 31 40 1045 1013 38 1045 3 27 1 485 15 484 6 6 5 5 24 801 9 9 2 321 4 803 805 804 1 802 800 802 802 15 4 802 94 94 802 802 796 321 1 320 321 320 326 323 2 2 1 1 1 1 2 132 2 1 200 198 199 198 107 30 91 196 195 6 4 3 2 2 190 20 190 189 18 18 18 18 18 187 133 186 99 87 132 132 129 133 79 133 9 132 138 144 38 37 36 36 35 1 34 3 33 33 29 29 28 30 34 18 17 16 13 12 11 10 9 1 8 7 1 7 7 7 7 7 7 7 7 7 10 18 29 28 27 25 24 23 22 3 21 21 2 21 29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/vmalloc.h> #include <linux/etherdevice.h> #include <linux/filter.h> #include <linux/rcupdate_trace.h> #include <linux/sched/signal.h> #include <net/bpf_sk_storage.h> #include <net/sock.h> #include <net/tcp.h> #include <net/net_namespace.h> #include <net/page_pool/helpers.h> #include <linux/error-injection.h> #include <linux/smp.h> #include <linux/sock_diag.h> #include <linux/netfilter.h> #include <net/netdev_rx_queue.h> #include <net/xdp.h> #include <net/netfilter/nf_bpf_link.h> #define CREATE_TRACE_POINTS #include <trace/events/bpf_test_run.h> struct bpf_test_timer { enum { NO_PREEMPT, NO_MIGRATE } mode; u32 i; u64 time_start, time_spent; }; static void bpf_test_timer_enter(struct bpf_test_timer *t) __acquires(rcu) { rcu_read_lock(); if (t->mode == NO_PREEMPT) preempt_disable(); else migrate_disable(); t->time_start = ktime_get_ns(); } static void bpf_test_timer_leave(struct bpf_test_timer *t) __releases(rcu) { t->time_start = 0; if (t->mode == NO_PREEMPT) preempt_enable(); else migrate_enable(); rcu_read_unlock(); } static bool bpf_test_timer_continue(struct bpf_test_timer *t, int iterations, u32 repeat, int *err, u32 *duration) __must_hold(rcu) { t->i += iterations; if (t->i >= repeat) { /* We're done. */ t->time_spent += ktime_get_ns() - t->time_start; do_div(t->time_spent, t->i); *duration = t->time_spent > U32_MAX ? U32_MAX : (u32)t->time_spent; *err = 0; goto reset; } if (signal_pending(current)) { /* During iteration: we've been cancelled, abort. */ *err = -EINTR; goto reset; } if (need_resched()) { /* During iteration: we need to reschedule between runs. */ t->time_spent += ktime_get_ns() - t->time_start; bpf_test_timer_leave(t); cond_resched(); bpf_test_timer_enter(t); } /* Do another round. */ return true; reset: t->i = 0; return false; } /* We put this struct at the head of each page with a context and frame * initialised when the page is allocated, so we don't have to do this on each * repetition of the test run. */ struct xdp_page_head { struct xdp_buff orig_ctx; struct xdp_buff ctx; union { /* ::data_hard_start starts here */ DECLARE_FLEX_ARRAY(struct xdp_frame, frame); DECLARE_FLEX_ARRAY(u8, data); }; }; struct xdp_test_data { struct xdp_buff *orig_ctx; struct xdp_rxq_info rxq; struct net_device *dev; struct page_pool *pp; struct xdp_frame **frames; struct sk_buff **skbs; struct xdp_mem_info mem; u32 batch_size; u32 frame_cnt; }; /* tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c:%MAX_PKT_SIZE * must be updated accordingly this gets changed, otherwise BPF selftests * will fail. */ #define TEST_XDP_FRAME_SIZE (PAGE_SIZE - sizeof(struct xdp_page_head)) #define TEST_XDP_MAX_BATCH 256 static void xdp_test_run_init_page(struct page *page, void *arg) { struct xdp_page_head *head = phys_to_virt(page_to_phys(page)); struct xdp_buff *new_ctx, *orig_ctx; u32 headroom = XDP_PACKET_HEADROOM; struct xdp_test_data *xdp = arg; size_t frm_len, meta_len; struct xdp_frame *frm; void *data; orig_ctx = xdp->orig_ctx; frm_len = orig_ctx->data_end - orig_ctx->data_meta; meta_len = orig_ctx->data - orig_ctx->data_meta; headroom -= meta_len; new_ctx = &head->ctx; frm = head->frame; data = head->data; memcpy(data + headroom, orig_ctx->data_meta, frm_len); xdp_init_buff(new_ctx, TEST_XDP_FRAME_SIZE, &xdp->rxq); xdp_prepare_buff(new_ctx, data, headroom, frm_len, true); new_ctx->data = new_ctx->data_meta + meta_len; xdp_update_frame_from_buff(new_ctx, frm); frm->mem = new_ctx->rxq->mem; memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx)); } static int xdp_test_run_setup(struct xdp_test_data *xdp, struct xdp_buff *orig_ctx) { struct page_pool *pp; int err = -ENOMEM; struct page_pool_params pp_params = { .order = 0, .flags = 0, .pool_size = xdp->batch_size, .nid = NUMA_NO_NODE, .init_callback = xdp_test_run_init_page, .init_arg = xdp, }; xdp->frames = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); if (!xdp->frames) return -ENOMEM; xdp->skbs = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); if (!xdp->skbs) goto err_skbs; pp = page_pool_create(&pp_params); if (IS_ERR(pp)) { err = PTR_ERR(pp); goto err_pp; } /* will copy 'mem.id' into pp->xdp_mem_id */ err = xdp_reg_mem_model(&xdp->mem, MEM_TYPE_PAGE_POOL, pp); if (err) goto err_mmodel; xdp->pp = pp; /* We create a 'fake' RXQ referencing the original dev, but with an * xdp_mem_info pointing to our page_pool */ xdp_rxq_info_reg(&xdp->rxq, orig_ctx->rxq->dev, 0, 0); xdp->rxq.mem.type = MEM_TYPE_PAGE_POOL; xdp->rxq.mem.id = pp->xdp_mem_id; xdp->dev = orig_ctx->rxq->dev; xdp->orig_ctx = orig_ctx; return 0; err_mmodel: page_pool_destroy(pp); err_pp: kvfree(xdp->skbs); err_skbs: kvfree(xdp->frames); return err; } static void xdp_test_run_teardown(struct xdp_test_data *xdp) { xdp_unreg_mem_model(&xdp->mem); page_pool_destroy(xdp->pp); kfree(xdp->frames); kfree(xdp->skbs); } static bool frame_was_changed(const struct xdp_page_head *head) { /* xdp_scrub_frame() zeroes the data pointer, flags is the last field, * i.e. has the highest chances to be overwritten. If those two are * untouched, it's most likely safe to skip the context reset. */ return head->frame->data != head->orig_ctx.data || head->frame->flags != head->orig_ctx.flags; } static bool ctx_was_changed(struct xdp_page_head *head) { return head->orig_ctx.data != head->ctx.data || head->orig_ctx.data_meta != head->ctx.data_meta || head->orig_ctx.data_end != head->ctx.data_end; } static void reset_ctx(struct xdp_page_head *head) { if (likely(!frame_was_changed(head) && !ctx_was_changed(head))) return; head->ctx.data = head->orig_ctx.data; head->ctx.data_meta = head->orig_ctx.data_meta; head->ctx.data_end = head->orig_ctx.data_end; xdp_update_frame_from_buff(&head->ctx, head->frame); } static int xdp_recv_frames(struct xdp_frame **frames, int nframes, struct sk_buff **skbs, struct net_device *dev) { gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; int i, n; LIST_HEAD(list); n = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, (void **)skbs); if (unlikely(n == 0)) { for (i = 0; i < nframes; i++) xdp_return_frame(frames[i]); return -ENOMEM; } for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; skb = __xdp_build_skb_from_frame(xdpf, skb, dev); if (!skb) { xdp_return_frame(xdpf); continue; } list_add_tail(&skb->list, &list); } netif_receive_skb_list(&list); return 0; } static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, u32 repeat) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); int err = 0, act, ret, i, nframes = 0, batch_sz; struct xdp_frame **frames = xdp->frames; struct xdp_page_head *head; struct xdp_frame *frm; bool redirect = false; struct xdp_buff *ctx; struct page *page; batch_sz = min_t(u32, repeat, xdp->batch_size); local_bh_disable(); xdp_set_return_frame_no_direct(); for (i = 0; i < batch_sz; i++) { page = page_pool_dev_alloc_pages(xdp->pp); if (!page) { err = -ENOMEM; goto out; } head = phys_to_virt(page_to_phys(page)); reset_ctx(head); ctx = &head->ctx; frm = head->frame; xdp->frame_cnt++; act = bpf_prog_run_xdp(prog, ctx); /* if program changed pkt bounds we need to update the xdp_frame */ if (unlikely(ctx_was_changed(head))) { ret = xdp_update_frame_from_buff(ctx, frm); if (ret) { xdp_return_buff(ctx); continue; } } switch (act) { case XDP_TX: /* we can't do a real XDP_TX since we're not in the * driver, so turn it into a REDIRECT back to the same * index */ ri->tgt_index = xdp->dev->ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; fallthrough; case XDP_REDIRECT: redirect = true; ret = xdp_do_redirect_frame(xdp->dev, ctx, frm, prog); if (ret) xdp_return_buff(ctx); break; case XDP_PASS: frames[nframes++] = frm; break; default: bpf_warn_invalid_xdp_action(NULL, prog, act); fallthrough; case XDP_DROP: xdp_return_buff(ctx); break; } } out: if (redirect) xdp_do_flush(); if (nframes) { ret = xdp_recv_frames(frames, nframes, xdp->skbs, xdp->dev); if (ret) err = ret; } xdp_clear_return_frame_no_direct(); local_bh_enable(); return err; } static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx, u32 repeat, u32 batch_size, u32 *time) { struct xdp_test_data xdp = { .batch_size = batch_size }; struct bpf_test_timer t = { .mode = NO_MIGRATE }; int ret; if (!repeat) repeat = 1; ret = xdp_test_run_setup(&xdp, ctx); if (ret) return ret; bpf_test_timer_enter(&t); do { xdp.frame_cnt = 0; ret = xdp_test_run_batch(&xdp, prog, repeat - t.i); if (unlikely(ret < 0)) break; } while (bpf_test_timer_continue(&t, xdp.frame_cnt, repeat, &ret, time)); bpf_test_timer_leave(&t); xdp_test_run_teardown(&xdp); return ret; } static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { struct bpf_prog_array_item item = {.prog = prog}; struct bpf_run_ctx *old_ctx; struct bpf_cg_run_ctx run_ctx; struct bpf_test_timer t = { NO_MIGRATE }; enum bpf_cgroup_storage_type stype; int ret; for_each_cgroup_storage_type(stype) { item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(item.cgroup_storage[stype])) { item.cgroup_storage[stype] = NULL; for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(item.cgroup_storage[stype]); return -ENOMEM; } } if (!repeat) repeat = 1; bpf_test_timer_enter(&t); old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); do { run_ctx.prog_item = &item; local_bh_disable(); if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = bpf_prog_run(prog, ctx); local_bh_enable(); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, time)); bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(item.cgroup_storage[stype]); return ret; } static int bpf_test_finish(const union bpf_attr *kattr, union bpf_attr __user *uattr, const void *data, struct skb_shared_info *sinfo, u32 size, u32 retval, u32 duration) { void __user *data_out = u64_to_user_ptr(kattr->test.data_out); int err = -EFAULT; u32 copy_size = size; /* Clamp copy if the user has provided a size hint, but copy the full * buffer if not to retain old behaviour. */ if (kattr->test.data_size_out && copy_size > kattr->test.data_size_out) { copy_size = kattr->test.data_size_out; err = -ENOSPC; } if (data_out) { int len = sinfo ? copy_size - sinfo->xdp_frags_size : copy_size; if (len < 0) { err = -ENOSPC; goto out; } if (copy_to_user(data_out, data, len)) goto out; if (sinfo) { int i, offset = len; u32 data_len; for (i = 0; i < sinfo->nr_frags; i++) { skb_frag_t *frag = &sinfo->frags[i]; if (offset >= copy_size) { err = -ENOSPC; break; } data_len = min_t(u32, copy_size - offset, skb_frag_size(frag)); if (copy_to_user(data_out + offset, skb_frag_address(frag), data_len)) goto out; offset += data_len; } } } if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size))) goto out; if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) goto out; if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration))) goto out; if (err != -ENOSPC) err = 0; out: trace_bpf_test_finish(&err); return err; } /* Integer types of various sizes and pointer combinations cover variety of * architecture dependent calling conventions. 7+ can be supported in the * future. */ __diag_push(); __diag_ignore_all("-Wmissing-prototypes", "Global functions as their definitions will be in vmlinux BTF"); __bpf_kfunc int bpf_fentry_test1(int a) { return a + 1; } EXPORT_SYMBOL_GPL(bpf_fentry_test1); int noinline bpf_fentry_test2(int a, u64 b) { return a + b; } int noinline bpf_fentry_test3(char a, int b, u64 c) { return a + b + c; } int noinline bpf_fentry_test4(void *a, char b, int c, u64 d) { return (long)a + b + c + d; } int noinline bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) { return a + (long)b + c + d + e; } int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) { return a + (long)b + c + d + (long)e + f; } struct bpf_fentry_test_t { struct bpf_fentry_test_t *a; }; int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) { asm volatile (""); return (long)arg; } int noinline bpf_fentry_test8(struct bpf_fentry_test_t *arg) { return (long)arg->a; } __bpf_kfunc u32 bpf_fentry_test9(u32 *a) { return *a; } void noinline bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) { } __bpf_kfunc int bpf_modify_return_test(int a, int *b) { *b += 1; return a + *b; } __bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d, void *e, char f, int g) { *b += 1; return a + *b + c + d + (long)e + f + g; } int noinline bpf_fentry_shadow_test(int a) { return a + 1; } struct prog_test_member1 { int a; }; struct prog_test_member { struct prog_test_member1 m; int c; }; struct prog_test_ref_kfunc { int a; int b; struct prog_test_member memb; struct prog_test_ref_kfunc *next; refcount_t cnt; }; __bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) { refcount_dec(&p->cnt); } __bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p) { } __diag_pop(); BTF_SET8_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) BTF_ID_FLAGS(func, bpf_modify_return_test2) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) BTF_SET8_END(bpf_test_modify_return_ids) static const struct btf_kfunc_id_set bpf_test_modify_return_set = { .owner = THIS_MODULE, .set = &bpf_test_modify_return_ids, }; BTF_SET8_START(test_sk_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE) BTF_SET8_END(test_sk_check_kfunc_ids) static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, u32 size, u32 headroom, u32 tailroom) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); void *data; if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); if (user_size > size) return ERR_PTR(-EMSGSIZE); size = SKB_DATA_ALIGN(size); data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); if (copy_from_user(data + headroom, data_in, user_size)) { kfree(data); return ERR_PTR(-EFAULT); } return data; } int bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct bpf_fentry_test_t arg = {}; u16 side_effect = 0, ret = 0; int b = 2, err = -EFAULT; u32 retval = 0; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || bpf_fentry_test4((void *)7, 8, 9, 10) != 34 || bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 || bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111 || bpf_fentry_test7((struct bpf_fentry_test_t *)0) != 0 || bpf_fentry_test8(&arg) != 0 || bpf_fentry_test9(&retval) != 0) goto out; break; case BPF_MODIFY_RETURN: ret = bpf_modify_return_test(1, &b); if (b != 2) side_effect++; b = 2; ret += bpf_modify_return_test2(1, &b, 3, 4, (void *)5, 6, 7); if (b != 2) side_effect++; break; default: goto out; } retval = ((u32)side_effect << 16) | ret; if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) goto out; err = 0; out: trace_bpf_test_finish(&err); return err; } struct bpf_raw_tp_test_run_info { struct bpf_prog *prog; void *ctx; u32 retval; }; static void __bpf_prog_test_run_raw_tp(void *data) { struct bpf_raw_tp_test_run_info *info = data; rcu_read_lock(); info->retval = bpf_prog_run(info->prog, info->ctx); rcu_read_unlock(); } int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); __u32 ctx_size_in = kattr->test.ctx_size_in; struct bpf_raw_tp_test_run_info info; int cpu = kattr->test.cpu, err = 0; int current_cpu; /* doesn't support data_in/out, ctx_out, duration, or repeat */ if (kattr->test.data_in || kattr->test.data_out || kattr->test.ctx_out || kattr->test.duration || kattr->test.repeat || kattr->test.batch_size) return -EINVAL; if (ctx_size_in < prog->aux->max_ctx_offset || ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(u64)) return -EINVAL; if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) return -EINVAL; if (ctx_size_in) { info.ctx = memdup_user(ctx_in, ctx_size_in); if (IS_ERR(info.ctx)) return PTR_ERR(info.ctx); } else { info.ctx = NULL; } info.prog = prog; current_cpu = get_cpu(); if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 || cpu == current_cpu) { __bpf_prog_test_run_raw_tp(&info); } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { /* smp_call_function_single() also checks cpu_online() * after csd_lock(). However, since cpu is from user * space, let's do an extra quick check to filter out * invalid value before smp_call_function_single(). */ err = -ENXIO; } else { err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp, &info, 1); } put_cpu(); if (!err && copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32))) err = -EFAULT; kfree(info.ctx); return err; } static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) { void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); u32 size = kattr->test.ctx_size_in; void *data; int err; if (!data_in && !data_out) return NULL; data = kzalloc(max_size, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); if (data_in) { err = bpf_check_uarg_tail_zero(USER_BPFPTR(data_in), max_size, size); if (err) { kfree(data); return ERR_PTR(err); } size = min_t(u32, max_size, size); if (copy_from_user(data, data_in, size)) { kfree(data); return ERR_PTR(-EFAULT); } } return data; } static int bpf_ctx_finish(const union bpf_attr *kattr, union bpf_attr __user *uattr, const void *data, u32 size) { void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); int err = -EFAULT; u32 copy_size = size; if (!data || !data_out) return 0; if (copy_size > kattr->test.ctx_size_out) { copy_size = kattr->test.ctx_size_out; err = -ENOSPC; } if (copy_to_user(data_out, data, copy_size)) goto out; if (copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size))) goto out; if (err != -ENOSPC) err = 0; out: return err; } /** * range_is_zero - test whether buffer is initialized * @buf: buffer to check * @from: check from this position * @to: check up until (excluding) this position * * This function returns true if the there is a non-zero byte * in the buf in the range [from,to). */ static inline bool range_is_zero(void *buf, size_t from, size_t to) { return !memchr_inv((u8 *)buf + from, 0, to - from); } static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; if (!__skb) return 0; /* make sure the fields we don't use are zeroed */ if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, mark))) return -EINVAL; /* mark is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, mark), offsetof(struct __sk_buff, priority))) return -EINVAL; /* priority is allowed */ /* ingress_ifindex is allowed */ /* ifindex is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, ifindex), offsetof(struct __sk_buff, cb))) return -EINVAL; /* cb is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb), offsetof(struct __sk_buff, tstamp))) return -EINVAL; /* tstamp is allowed */ /* wire_len is allowed */ /* gso_segs is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs), offsetof(struct __sk_buff, gso_size))) return -EINVAL; /* gso_size is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_size), offsetof(struct __sk_buff, hwtstamp))) return -EINVAL; /* hwtstamp is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, hwtstamp), sizeof(struct __sk_buff))) return -EINVAL; skb->mark = __skb->mark; skb->priority = __skb->priority; skb->skb_iif = __skb->ingress_ifindex; skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); if (__skb->wire_len == 0) { cb->pkt_len = skb->len; } else { if (__skb->wire_len < skb->len || __skb->wire_len > GSO_LEGACY_MAX_SIZE) return -EINVAL; cb->pkt_len = __skb->wire_len; } if (__skb->gso_segs > GSO_MAX_SEGS) return -EINVAL; skb_shinfo(skb)->gso_segs = __skb->gso_segs; skb_shinfo(skb)->gso_size = __skb->gso_size; skb_shinfo(skb)->hwtstamps.hwtstamp = __skb->hwtstamp; return 0; } static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; if (!__skb) return; __skb->mark = skb->mark; __skb->priority = skb->priority; __skb->ingress_ifindex = skb->skb_iif; __skb->ifindex = skb->dev->ifindex; __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); __skb->wire_len = cb->pkt_len; __skb->gso_segs = skb_shinfo(skb)->gso_segs; __skb->hwtstamp = skb_shinfo(skb)->hwtstamps.hwtstamp; } static struct proto bpf_dummy_proto = { .name = "bpf_dummy", .owner = THIS_MODULE, .obj_size = sizeof(struct sock), }; int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool is_l2 = false, is_direct_pkt_access = false; struct net *net = current->nsproxy->net_ns; struct net_device *dev = net->loopback_dev; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct __sk_buff *ctx = NULL; u32 retval, duration; int hh_len = ETH_HLEN; struct sk_buff *skb; struct sock *sk; void *data; int ret; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; data = bpf_test_init(kattr, kattr->test.data_size_in, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) return PTR_ERR(data); ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); if (IS_ERR(ctx)) { kfree(data); return PTR_ERR(ctx); } switch (prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: is_l2 = true; fallthrough; case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: is_direct_pkt_access = true; break; default: break; } sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1); if (!sk) { kfree(data); kfree(ctx); return -ENOMEM; } sock_init_data(NULL, sk); skb = slab_build_skb(data); if (!skb) { kfree(data); kfree(ctx); sk_free(sk); return -ENOMEM; } skb->sk = sk; skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); if (ctx && ctx->ifindex > 1) { dev = dev_get_by_index(net, ctx->ifindex); if (!dev) { ret = -ENODEV; goto out; } } skb->protocol = eth_type_trans(skb, dev); skb_reset_network_header(skb); switch (skb->protocol) { case htons(ETH_P_IP): sk->sk_family = AF_INET; if (sizeof(struct iphdr) <= skb_headlen(skb)) { sk->sk_rcv_saddr = ip_hdr(skb)->saddr; sk->sk_daddr = ip_hdr(skb)->daddr; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): sk->sk_family = AF_INET6; if (sizeof(struct ipv6hdr) <= skb_headlen(skb)) { sk->sk_v6_rcv_saddr = ipv6_hdr(skb)->saddr; sk->sk_v6_daddr = ipv6_hdr(skb)->daddr; } break; #endif default: break; } if (is_l2) __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); ret = convert___skb_to_skb(skb, ctx); if (ret) goto out; ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false); if (ret) goto out; if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { ret = -ENOMEM; goto out; } } memset(__skb_push(skb, hh_len), 0, hh_len); } convert_skb_to___skb(skb, ctx); size = skb->len; /* bpf program can never convert linear skb to non-linear */ if (WARN_ON_ONCE(skb_is_nonlinear(skb))) size = skb_headlen(skb); ret = bpf_test_finish(kattr, uattr, skb->data, NULL, size, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct __sk_buff)); out: if (dev && dev != net->loopback_dev) dev_put(dev); kfree_skb(skb); sk_free(sk); kfree(ctx); return ret; } static int xdp_convert_md_to_buff(struct xdp_md *xdp_md, struct xdp_buff *xdp) { unsigned int ingress_ifindex, rx_queue_index; struct netdev_rx_queue *rxqueue; struct net_device *device; if (!xdp_md) return 0; if (xdp_md->egress_ifindex != 0) return -EINVAL; ingress_ifindex = xdp_md->ingress_ifindex; rx_queue_index = xdp_md->rx_queue_index; if (!ingress_ifindex && rx_queue_index) return -EINVAL; if (ingress_ifindex) { device = dev_get_by_index(current->nsproxy->net_ns, ingress_ifindex); if (!device) return -ENODEV; if (rx_queue_index >= device->real_num_rx_queues) goto free_dev; rxqueue = __netif_get_rx_queue(device, rx_queue_index); if (!xdp_rxq_info_is_reg(&rxqueue->xdp_rxq)) goto free_dev; xdp->rxq = &rxqueue->xdp_rxq; /* The device is now tracked in the xdp->rxq for later * dev_put() */ } xdp->data = xdp->data_meta + xdp_md->data; return 0; free_dev: dev_put(device); return -EINVAL; } static void xdp_convert_buff_to_md(struct xdp_buff *xdp, struct xdp_md *xdp_md) { if (!xdp_md) return; xdp_md->data = xdp->data - xdp->data_meta; xdp_md->data_end = xdp->data_end - xdp->data_meta; if (xdp_md->ingress_ifindex) dev_put(xdp->rxq->dev); } int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES); u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); u32 batch_size = kattr->test.batch_size; u32 retval = 0, duration, max_data_sz; u32 size = kattr->test.data_size_in; u32 headroom = XDP_PACKET_HEADROOM; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; struct skb_shared_info *sinfo; struct xdp_buff xdp = {}; int i, ret = -EINVAL; struct xdp_md *ctx; void *data; if (prog->expected_attach_type == BPF_XDP_DEVMAP || prog->expected_attach_type == BPF_XDP_CPUMAP) return -EINVAL; if (kattr->test.flags & ~BPF_F_TEST_XDP_LIVE_FRAMES) return -EINVAL; if (bpf_prog_is_dev_bound(prog->aux)) return -EINVAL; if (do_live) { if (!batch_size) batch_size = NAPI_POLL_WEIGHT; else if (batch_size > TEST_XDP_MAX_BATCH) return -E2BIG; headroom += sizeof(struct xdp_page_head); } else if (batch_size) { return -EINVAL; } ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md)); if (IS_ERR(ctx)) return PTR_ERR(ctx); if (ctx) { /* There can't be user provided data before the meta data */ if (ctx->data_meta || ctx->data_end != size || ctx->data > ctx->data_end || unlikely(xdp_metalen_invalid(ctx->data)) || (do_live && (kattr->test.data_out || kattr->test.ctx_out))) goto free_ctx; /* Meta data is allocated from the headroom */ headroom -= ctx->data; } max_data_sz = 4096 - headroom - tailroom; if (size > max_data_sz) { /* disallow live data mode for jumbo frames */ if (do_live) goto free_ctx; size = max_data_sz; } data = bpf_test_init(kattr, size, max_data_sz, headroom, tailroom); if (IS_ERR(data)) { ret = PTR_ERR(data); goto free_ctx; } rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); rxqueue->xdp_rxq.frag_size = headroom + max_data_sz + tailroom; xdp_init_buff(&xdp, rxqueue->xdp_rxq.frag_size, &rxqueue->xdp_rxq); xdp_prepare_buff(&xdp, data, headroom, size, true); sinfo = xdp_get_shared_info_from_buff(&xdp); ret = xdp_convert_md_to_buff(ctx, &xdp); if (ret) goto free_data; if (unlikely(kattr->test.data_size_in > size)) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); while (size < kattr->test.data_size_in) { struct page *page; skb_frag_t *frag; u32 data_len; if (sinfo->nr_frags == MAX_SKB_FRAGS) { ret = -ENOMEM; goto out; } page = alloc_page(GFP_KERNEL); if (!page) { ret = -ENOMEM; goto out; } frag = &sinfo->frags[sinfo->nr_frags++]; data_len = min_t(u32, kattr->test.data_size_in - size, PAGE_SIZE); skb_frag_fill_page_desc(frag, page, 0, data_len); if (copy_from_user(page_address(page), data_in + size, data_len)) { ret = -EFAULT; goto out; } sinfo->xdp_frags_size += data_len; size += data_len; } xdp_buff_set_frags_flag(&xdp); } if (repeat > 1) bpf_prog_change_xdp(NULL, prog); if (do_live) ret = bpf_test_run_xdp_live(prog, &xdp, repeat, batch_size, &duration); else ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); /* We convert the xdp_buff back to an xdp_md before checking the return * code so the reference count of any held netdevice will be decremented * even if the test run failed. */ xdp_convert_buff_to_md(&xdp, ctx); if (ret) goto out; size = xdp.data_end - xdp.data_meta + sinfo->xdp_frags_size; ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct xdp_md)); out: if (repeat > 1) bpf_prog_change_xdp(prog, NULL); free_data: for (i = 0; i < sinfo->nr_frags; i++) __free_page(skb_frag_page(&sinfo->frags[i])); kfree(data); free_ctx: kfree(ctx); return ret; } static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx) { /* make sure the fields we don't use are zeroed */ if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags))) return -EINVAL; /* flags is allowed */ if (!range_is_zero(ctx, offsetofend(struct bpf_flow_keys, flags), sizeof(struct bpf_flow_keys))) return -EINVAL; return 0; } int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct bpf_test_timer t = { NO_PREEMPT }; u32 size = kattr->test.data_size_in; struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; struct bpf_flow_keys *user_ctx; struct bpf_flow_keys flow_keys; const struct ethhdr *eth; unsigned int flags = 0; u32 retval, duration; void *data; int ret; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (size < ETH_HLEN) return -EINVAL; data = bpf_test_init(kattr, kattr->test.data_size_in, size, 0, 0); if (IS_ERR(data)) return PTR_ERR(data); eth = (struct ethhdr *)data; if (!repeat) repeat = 1; user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys)); if (IS_ERR(user_ctx)) { kfree(data); return PTR_ERR(user_ctx); } if (user_ctx) { ret = verify_user_bpf_flow_keys(user_ctx); if (ret) goto out; flags = user_ctx->flags; } ctx.flow_keys = &flow_keys; ctx.data = data; ctx.data_end = (__u8 *)data + size; bpf_test_timer_enter(&t); do { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, size, flags); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration)); bpf_test_timer_leave(&t); if (ret < 0) goto out; ret = bpf_test_finish(kattr, uattr, &flow_keys, NULL, sizeof(flow_keys), retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(struct bpf_flow_keys)); out: kfree(user_ctx); kfree(data); return ret; } int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct bpf_test_timer t = { NO_PREEMPT }; struct bpf_prog_array *progs = NULL; struct bpf_sk_lookup_kern ctx = {}; u32 repeat = kattr->test.repeat; struct bpf_sk_lookup *user_ctx; u32 retval, duration; int ret = -EINVAL; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || kattr->test.data_size_out) return -EINVAL; if (!repeat) repeat = 1; user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx)); if (IS_ERR(user_ctx)) return PTR_ERR(user_ctx); if (!user_ctx) return -EINVAL; if (user_ctx->sk) goto out; if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) goto out; if (user_ctx->local_port > U16_MAX) { ret = -ERANGE; goto out; } ctx.family = (u16)user_ctx->family; ctx.protocol = (u16)user_ctx->protocol; ctx.dport = (u16)user_ctx->local_port; ctx.sport = user_ctx->remote_port; switch (ctx.family) { case AF_INET: ctx.v4.daddr = (__force __be32)user_ctx->local_ip4; ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6; ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6; break; #endif default: ret = -EAFNOSUPPORT; goto out; } progs = bpf_prog_array_alloc(1, GFP_KERNEL); if (!progs) { ret = -ENOMEM; goto out; } progs->items[0].prog = prog; bpf_test_timer_enter(&t); do { ctx.selected_sk = NULL; retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration)); bpf_test_timer_leave(&t); if (ret < 0) goto out; user_ctx->cookie = 0; if (ctx.selected_sk) { if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) { ret = -EOPNOTSUPP; goto out; } user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); } ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); out: bpf_prog_array_free(progs); kfree(user_ctx); return ret; } int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); __u32 ctx_size_in = kattr->test.ctx_size_in; void *ctx = NULL; u32 retval; int err = 0; /* doesn't support data_in/out, ctx_out, duration, or repeat or flags */ if (kattr->test.data_in || kattr->test.data_out || kattr->test.ctx_out || kattr->test.duration || kattr->test.repeat || kattr->test.flags || kattr->test.batch_size) return -EINVAL; if (ctx_size_in < prog->aux->max_ctx_offset || ctx_size_in > U16_MAX) return -EINVAL; if (ctx_size_in) { ctx = memdup_user(ctx_in, ctx_size_in); if (IS_ERR(ctx)) return PTR_ERR(ctx); } rcu_read_lock_trace(); retval = bpf_prog_run_pin_on_cpu(prog, ctx); rcu_read_unlock_trace(); if (copy_to_user(&uattr->test.retval, &retval, sizeof(u32))) { err = -EFAULT; goto out; } if (ctx_size_in) if (copy_to_user(ctx_in, ctx, ctx_size_in)) err = -EFAULT; out: kfree(ctx); return err; } static int verify_and_copy_hook_state(struct nf_hook_state *state, const struct nf_hook_state *user, struct net_device *dev) { if (user->in || user->out) return -EINVAL; if (user->net || user->sk || user->okfn) return -EINVAL; switch (user->pf) { case NFPROTO_IPV4: case NFPROTO_IPV6: switch (state->hook) { case NF_INET_PRE_ROUTING: state->in = dev; break; case NF_INET_LOCAL_IN: state->in = dev; break; case NF_INET_FORWARD: state->in = dev; state->out = dev; break; case NF_INET_LOCAL_OUT: state->out = dev; break; case NF_INET_POST_ROUTING: state->out = dev; break; } break; default: return -EINVAL; } state->pf = user->pf; state->hook = user->hook; return 0; } static __be16 nfproto_eth(int nfproto) { switch (nfproto) { case NFPROTO_IPV4: return htons(ETH_P_IP); case NFPROTO_IPV6: break; } return htons(ETH_P_IPV6); } int bpf_prog_test_run_nf(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct net *net = current->nsproxy->net_ns; struct net_device *dev = net->loopback_dev; struct nf_hook_state *user_ctx, hook_state = { .pf = NFPROTO_IPV4, .hook = NF_INET_LOCAL_OUT, }; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct bpf_nf_ctx ctx = { .state = &hook_state, }; struct sk_buff *skb = NULL; u32 retval, duration; void *data; int ret; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (size < sizeof(struct iphdr)) return -EINVAL; data = bpf_test_init(kattr, kattr->test.data_size_in, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) return PTR_ERR(data); if (!repeat) repeat = 1; user_ctx = bpf_ctx_init(kattr, sizeof(struct nf_hook_state)); if (IS_ERR(user_ctx)) { kfree(data); return PTR_ERR(user_ctx); } if (user_ctx) { ret = verify_and_copy_hook_state(&hook_state, user_ctx, dev); if (ret) goto out; } skb = slab_build_skb(data); if (!skb) { ret = -ENOMEM; goto out; } data = NULL; /* data released via kfree_skb */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); ret = -EINVAL; if (hook_state.hook != NF_INET_LOCAL_OUT) { if (size < ETH_HLEN + sizeof(struct iphdr)) goto out; skb->protocol = eth_type_trans(skb, dev); switch (skb->protocol) { case htons(ETH_P_IP): if (hook_state.pf == NFPROTO_IPV4) break; goto out; case htons(ETH_P_IPV6): if (size < ETH_HLEN + sizeof(struct ipv6hdr)) goto out; if (hook_state.pf == NFPROTO_IPV6) break; goto out; default: ret = -EPROTO; goto out; } skb_reset_network_header(skb); } else { skb->protocol = nfproto_eth(hook_state.pf); } ctx.skb = skb; ret = bpf_test_run(prog, &ctx, repeat, &retval, &duration, false); if (ret) goto out; ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); out: kfree(user_ctx); kfree_skb(skb); kfree(data); return ret; } static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = { .owner = THIS_MODULE, .set = &test_sk_check_kfunc_ids, }; BTF_ID_LIST(bpf_prog_test_dtor_kfunc_ids) BTF_ID(struct, prog_test_ref_kfunc) BTF_ID(func, bpf_kfunc_call_test_release) BTF_ID(struct, prog_test_member) BTF_ID(func, bpf_kfunc_call_memb_release) static int __init bpf_prog_test_run_init(void) { const struct btf_id_dtor_kfunc bpf_prog_test_dtor_kfunc[] = { { .btf_id = bpf_prog_test_dtor_kfunc_ids[0], .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[1] }, { .btf_id = bpf_prog_test_dtor_kfunc_ids[2], .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[3], }, }; int ret; ret = register_btf_fmodret_id_set(&bpf_test_modify_return_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc, ARRAY_SIZE(bpf_prog_test_dtor_kfunc), THIS_MODULE); } late_initcall(bpf_prog_test_run_init); |
1617 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2012 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if_arp.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/kernel.h> #include <linux/llc.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/openvswitch.h> #include <linux/export.h> #include <net/ip_tunnels.h> #include <net/rtnetlink.h> #include "datapath.h" #include "vport.h" #include "vport-internal_dev.h" #include "vport-netdev.h" static struct vport_ops ovs_netdev_vport_ops; /* Must be called with rcu_read_lock. */ static void netdev_port_receive(struct sk_buff *skb) { struct vport *vport; vport = ovs_netdev_get_vport(skb->dev); if (unlikely(!vport)) goto error; if (unlikely(skb_warn_if_lro(skb))) goto error; /* Make our own copy of the packet. Otherwise we will mangle the * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return; if (skb->dev->type == ARPHRD_ETHER) skb_push_rcsum(skb, ETH_HLEN); ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: kfree_skb(skb); } /* Called with rcu_read_lock and bottom-halves disabled. */ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; netdev_port_receive(skb); return RX_HANDLER_CONSUMED; } static struct net_device *get_dpdev(const struct datapath *dp) { struct vport *local; local = ovs_vport_ovsl(dp, OVSP_LOCAL); return local->dev; } struct vport *ovs_netdev_link(struct vport *vport, const char *name) { int err; vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); if (!vport->dev) { err = -ENODEV; goto error_free_vport; } netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); if (vport->dev->flags & IFF_LOOPBACK || (vport->dev->type != ARPHRD_ETHER && vport->dev->type != ARPHRD_NONE) || ovs_is_internal_dev(vport->dev)) { err = -EINVAL; goto error_put; } rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp), NULL, NULL, NULL); if (err) goto error_unlock; err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); if (err) goto error_master_upper_dev_unlink; dev_disable_lro(vport->dev); dev_set_promiscuity(vport->dev, 1); vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); return vport; error_master_upper_dev_unlink: netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); error_unlock: rtnl_unlock(); error_put: netdev_put(vport->dev, &vport->dev_tracker); error_free_vport: ovs_vport_free(vport); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(ovs_netdev_link); static struct vport *netdev_create(const struct vport_parms *parms) { struct vport *vport; vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); if (IS_ERR(vport)) return vport; return ovs_netdev_link(vport, parms->name); } static void vport_netdev_free(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); netdev_put(vport->dev, &vport->dev_tracker); ovs_vport_free(vport); } void ovs_netdev_detach_dev(struct vport *vport) { ASSERT_RTNL(); vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; netdev_rx_handler_unregister(vport->dev); netdev_upper_dev_unlink(vport->dev, netdev_master_upper_dev_get(vport->dev)); dev_set_promiscuity(vport->dev, -1); } static void netdev_destroy(struct vport *vport) { rtnl_lock(); if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); rtnl_unlock(); call_rcu(&vport->rcu, vport_netdev_free); } void ovs_netdev_tunnel_destroy(struct vport *vport) { rtnl_lock(); if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); /* We can be invoked by both explicit vport deletion and * underlying netdev deregistration; delete the link only * if it's not already shutting down. */ if (vport->dev->reg_state == NETREG_REGISTERED) rtnl_delete_link(vport->dev, 0, NULL); netdev_put(vport->dev, &vport->dev_tracker); vport->dev = NULL; rtnl_unlock(); call_rcu(&vport->rcu, vport_netdev_free); } EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); /* Returns null if this device is not attached to a datapath. */ struct vport *ovs_netdev_get_vport(struct net_device *dev) { if (likely(netif_is_ovs_port(dev))) return (struct vport *) rcu_dereference_rtnl(dev->rx_handler_data); else return NULL; } static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, .send = dev_queue_xmit, }; int __init ovs_netdev_init(void) { return ovs_vport_ops_register(&ovs_netdev_vport_ops); } void ovs_netdev_exit(void) { ovs_vport_ops_unregister(&ovs_netdev_vport_ops); } |
27 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Intel SMP support routines. * * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> * (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com> * (c) 2002,2003 Andi Kleen, SuSE Labs. * * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com> */ #include <linux/init.h> #include <linux/mm.h> #include <linux/delay.h> #include <linux/spinlock.h> #include <linux/export.h> #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/cache.h> #include <linux/interrupt.h> #include <linux/cpu.h> #include <linux/gfp.h> #include <linux/kexec.h> #include <asm/mtrr.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/apic.h> #include <asm/cpu.h> #include <asm/idtentry.h> #include <asm/nmi.h> #include <asm/mce.h> #include <asm/trace/irq_vectors.h> #include <asm/kexec.h> #include <asm/reboot.h> /* * Some notes on x86 processor bugs affecting SMP operation: * * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. * The Linux implications for SMP are handled as follows: * * Pentium III / [Xeon] * None of the E1AP-E3AP errata are visible to the user. * * E1AP. see PII A1AP * E2AP. see PII A2AP * E3AP. see PII A3AP * * Pentium II / [Xeon] * None of the A1AP-A3AP errata are visible to the user. * * A1AP. see PPro 1AP * A2AP. see PPro 2AP * A3AP. see PPro 7AP * * Pentium Pro * None of 1AP-9AP errata are visible to the normal user, * except occasional delivery of 'spurious interrupt' as trap #15. * This is very rare and a non-problem. * * 1AP. Linux maps APIC as non-cacheable * 2AP. worked around in hardware * 3AP. fixed in C0 and above steppings microcode update. * Linux does not use excessive STARTUP_IPIs. * 4AP. worked around in hardware * 5AP. symmetric IO mode (normal Linux operation) not affected. * 'noapic' mode has vector 0xf filled out properly. * 6AP. 'noapic' mode might be affected - fixed in later steppings * 7AP. We do not assume writes to the LVT deasserting IRQs * 8AP. We do not enable low power mode (deep sleep) during MP bootup * 9AP. We do not use mixed mode * * Pentium * There is a marginal case where REP MOVS on 100MHz SMP * machines with B stepping processors can fail. XXX should provide * an L1cache=Writethrough or L1cache=off option. * * B stepping CPUs may hang. There are hardware work arounds * for this. We warn about it in case your board doesn't have the work * arounds. Basically that's so I can tell anyone with a B stepping * CPU and SMP problems "tough". * * Specific items [From Pentium Processor Specification Update] * * 1AP. Linux doesn't use remote read * 2AP. Linux doesn't trust APIC errors * 3AP. We work around this * 4AP. Linux never generated 3 interrupts of the same priority * to cause a lost local interrupt. * 5AP. Remote read is never used * 6AP. not affected - worked around in hardware * 7AP. not affected - worked around in hardware * 8AP. worked around in hardware - we get explicit CS errors if not * 9AP. only 'noapic' mode affected. Might generate spurious * interrupts, we log only the first one and count the * rest silently. * 10AP. not affected - worked around in hardware * 11AP. Linux reads the APIC between writes to avoid this, as per * the documentation. Make sure you preserve this as it affects * the C stepping chips too. * 12AP. not affected - worked around in hardware * 13AP. not affected - worked around in hardware * 14AP. we always deassert INIT during bootup * 15AP. not affected - worked around in hardware * 16AP. not affected - worked around in hardware * 17AP. not affected - worked around in hardware * 18AP. not affected - worked around in hardware * 19AP. not affected - worked around in BIOS * * If this sounds worrying believe me these bugs are either ___RARE___, * or are signal timing bugs worked around in hardware and there's * about nothing of note with C stepping upwards. */ static atomic_t stopping_cpu = ATOMIC_INIT(-1); static bool smp_no_nmi_ipi = false; static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) { /* We are registered on stopping cpu too, avoid spurious NMI */ if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) return NMI_HANDLED; cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); return NMI_HANDLED; } /* * this function calls the 'stop' function on all other CPUs in the system. */ DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { apic_eoi(); cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); } static int register_stop_handler(void) { return register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, NMI_FLAG_FIRST, "smp_stop"); } static void native_stop_other_cpus(int wait) { unsigned int cpu = smp_processor_id(); unsigned long flags, timeout; if (reboot_force) return; /* Only proceed if this is the first CPU to reach this code */ if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1) return; /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */ if (kexec_in_progress) smp_kick_mwait_play_dead(); /* * 1) Send an IPI on the reboot vector to all other CPUs. * * The other CPUs should react on it after leaving critical * sections and re-enabling interrupts. They might still hold * locks, but there is nothing which can be done about that. * * 2) Wait for all other CPUs to report that they reached the * HLT loop in stop_this_cpu() * * 3) If #2 timed out send an NMI to the CPUs which did not * yet report * * 4) Wait for all other CPUs to report that they reached the * HLT loop in stop_this_cpu() * * #3 can obviously race against a CPU reaching the HLT loop late. * That CPU will have reported already and the "have all CPUs * reached HLT" condition will be true despite the fact that the * other CPU is still handling the NMI. Again, there is no * protection against that as "disabled" APICs still respond to * NMIs. */ cpumask_copy(&cpus_stop_mask, cpu_online_mask); cpumask_clear_cpu(cpu, &cpus_stop_mask); if (!cpumask_empty(&cpus_stop_mask)) { apic_send_IPI_allbutself(REBOOT_VECTOR); /* * Don't wait longer than a second for IPI completion. The * wait request is not checked here because that would * prevent an NMI shutdown attempt in case that not all * CPUs reach shutdown state. */ timeout = USEC_PER_SEC; while (!cpumask_empty(&cpus_stop_mask) && timeout--) udelay(1); } /* if the REBOOT_VECTOR didn't work, try with the NMI */ if (!cpumask_empty(&cpus_stop_mask)) { /* * If NMI IPI is enabled, try to register the stop handler * and send the IPI. In any case try to wait for the other * CPUs to stop. */ if (!smp_no_nmi_ipi && !register_stop_handler()) { pr_emerg("Shutting down cpus with NMI\n"); for_each_cpu(cpu, &cpus_stop_mask) __apic_send_IPI(cpu, NMI_VECTOR); } /* * Don't wait longer than 10 ms if the caller didn't * request it. If wait is true, the machine hangs here if * one or more CPUs do not reach shutdown state. */ timeout = USEC_PER_MSEC * 10; while (!cpumask_empty(&cpus_stop_mask) && (wait || timeout--)) udelay(1); } local_irq_save(flags); disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); local_irq_restore(flags); /* * Ensure that the cpus_stop_mask cache lines are invalidated on * the other CPUs. See comment vs. SME in stop_this_cpu(). */ cpumask_clear(&cpus_stop_mask); } /* * Reschedule call back. KVM uses this interrupt to force a cpu out of * guest mode. */ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) { apic_eoi(); trace_reschedule_entry(RESCHEDULE_VECTOR); inc_irq_stat(irq_resched_count); scheduler_ipi(); trace_reschedule_exit(RESCHEDULE_VECTOR); } DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) { apic_eoi(); trace_call_function_entry(CALL_FUNCTION_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_interrupt(); trace_call_function_exit(CALL_FUNCTION_VECTOR); } DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single) { apic_eoi(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); } static int __init nonmi_ipi_setup(char *str) { smp_no_nmi_ipi = true; return 1; } __setup("nonmi_ipi", nonmi_ipi_setup); struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, .stop_other_cpus = native_stop_other_cpus, #if defined(CONFIG_KEXEC_CORE) .crash_stop_other_cpus = kdump_nmi_shootdown_cpus, #endif .smp_send_reschedule = native_smp_send_reschedule, .kick_ap_alive = native_kick_ap, .cpu_disable = native_cpu_disable, .play_dead = native_play_dead, .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); |
13 13 9 1612 179 1612 109 1621 63 18 30 13 4 1620 19 18 5 13 5 5 4 18 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: * Fix for packet capture - Nick Eggleston <nick@dccinc.com>; * Add HW acceleration hooks - David S. Miller <davem@redhat.com>; * Correct all the locking - David S. Miller <davem@redhat.com>; * Use hash table for VLAN groups - David S. Miller <davem@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <net/p8022.h> #include <net/arp.h> #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/uaccess.h> #include <linux/if_vlan.h> #include "vlan.h" #include "vlanproc.h" #define DRV_VERSION "1.8" /* Global VLAN variables */ unsigned int vlan_net_id __read_mostly; const char vlan_fullname[] = "802.1Q VLAN Support"; const char vlan_version[] = DRV_VERSION; /* End of global variables definitions. */ static int vlan_group_prealloc_vid(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id) { struct net_device **array; unsigned int vidx; unsigned int size; int pidx; ASSERT_RTNL(); pidx = vlan_proto_idx(vlan_proto); if (pidx < 0) return -EINVAL; vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN; array = vg->vlan_devices_arrays[pidx][vidx]; if (array != NULL) return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; array = kzalloc(size, GFP_KERNEL_ACCOUNT); if (array == NULL) return -ENOBUFS; /* paired with smp_rmb() in __vlan_group_get_device() */ smp_wmb(); vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } static void vlan_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev, struct vlan_dev_priv *vlan) { if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_stacked_transfer_operstate(rootdev, dev); } void unregister_vlan_dev(struct net_device *dev, struct list_head *head) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct vlan_info *vlan_info; struct vlan_group *grp; u16 vlan_id = vlan->vlan_id; ASSERT_RTNL(); vlan_info = rtnl_dereference(real_dev->vlan_info); BUG_ON(!vlan_info); grp = &vlan_info->grp; grp->nr_vlan_devs--; if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_leave(dev); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_leave(dev); vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL); netdev_upper_dev_unlink(real_dev, dev); /* Because unregister_netdevice_queue() makes sure at least one rcu * grace period is respected before device freeing, * we dont need to call synchronize_net() here. */ unregister_netdevice_queue(dev, head); if (grp->nr_vlan_devs == 0) { vlan_mvrp_uninit_applicant(real_dev); vlan_gvrp_uninit_applicant(real_dev); } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); } int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack) { const char *name = real_dev->name; if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { pr_info("VLANs not supported on %s\n", name); NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; } if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) { NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists"); return -EEXIST; } return 0; } int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; u16 vlan_id = vlan->vlan_id; struct vlan_info *vlan_info; struct vlan_group *grp; int err; err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id); if (err) return err; vlan_info = rtnl_dereference(real_dev->vlan_info); /* vlan_info should be there now. vlan_vid_add took care of it */ BUG_ON(!vlan_info); grp = &vlan_info->grp; if (grp->nr_vlan_devs == 0) { err = vlan_gvrp_init_applicant(real_dev); if (err < 0) goto out_vid_del; err = vlan_mvrp_init_applicant(real_dev); if (err < 0) goto out_uninit_gvrp; } err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id); if (err < 0) goto out_uninit_mvrp; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; vlan_stacked_transfer_operstate(real_dev, dev, vlan); linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ /* So, got the sucker initialized, now lets place * it into our local structure. */ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; return 0; out_unregister_netdev: unregister_netdevice(dev); out_uninit_mvrp: if (grp->nr_vlan_devs == 0) vlan_mvrp_uninit_applicant(real_dev); out_uninit_gvrp: if (grp->nr_vlan_devs == 0) vlan_gvrp_uninit_applicant(real_dev); out_vid_del: vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); return err; } /* Attach a VLAN device to a mac address (ie Ethernet Card). * Returns 0 if the device was created or a negative error code otherwise. */ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) { struct net_device *new_dev; struct vlan_dev_priv *vlan; struct net *net = dev_net(real_dev); struct vlan_net *vn = net_generic(net, vlan_net_id); char name[IFNAMSIZ]; int err; if (vlan_id >= VLAN_VID_MASK) return -ERANGE; err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id, NULL); if (err < 0) return err; /* Gotta set up the fields for the device. */ switch (vn->name_type) { case VLAN_NAME_TYPE_RAW_PLUS_VID: /* name will look like: eth1.0005 */ snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: vlan5 */ snprintf(name, IFNAMSIZ, "vlan%i", vlan_id); break; case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: eth0.5 */ snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID: /* Put our vlan.VID in the name. * Name will look like: vlan0005 */ default: snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id); } new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, NET_NAME_UNKNOWN, vlan_setup); if (new_dev == NULL) return -ENOBUFS; dev_net_set(new_dev, net); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); vlan->vlan_id = vlan_id; vlan->real_dev = real_dev; vlan->dent = NULL; vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; return 0; out_free_newdev: free_netdev(new_dev); return err; } static void vlan_sync_address(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); /* May be called without an actual change */ if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; /* vlan continues to inherit address of lower device */ if (vlan_dev_inherit_address(vlandev, dev)) goto out; /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_del(dev, vlandev->dev_addr); /* vlan address was equal to the old address and is different from * the new address */ if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } static void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); netif_inherit_tso_max(vlandev, dev); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; else vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN; #if IS_ENABLED(CONFIG_FCOE) vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid; #endif vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE; vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev); netdev_update_features(vlandev); } static int __vlan_device_event(struct net_device *dev, unsigned long event) { int err = 0; switch (event) { case NETDEV_CHANGENAME: vlan_proc_rem_dev(dev); err = vlan_proc_add_dev(dev); break; case NETDEV_REGISTER: err = vlan_proc_add_dev(dev); break; case NETDEV_UNREGISTER: vlan_proc_rem_dev(dev); break; } return err; } static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; int i, flgs; struct net_device *vlandev; struct vlan_dev_priv *vlan; bool last = false; LIST_HEAD(list); int err; if (is_vlan_dev(dev)) { int err = __vlan_device_event(dev, event); if (err) return notifier_from_errno(err); } if ((event == NETDEV_UP) && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) { pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } if (event == NETDEV_DOWN && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) goto out; grp = &vlan_info->grp; /* It is OK that we do not hold the group lock right now, * as we run under the RTNL lock. */ switch (event) { case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); break; case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan_sync_address(dev, vlandev); } break; case NETDEV_CHANGEMTU: vlan_group_for_each_dev(grp, i, vlandev) { if (vlandev->mtu <= dev->mtu) continue; dev_set_mtu(vlandev, dev->mtu); } break; case NETDEV_FEAT_CHANGE: /* Propagate device features to underlying device */ vlan_group_for_each_dev(grp, i, vlandev) vlan_transfer_features(dev, vlandev); break; case NETDEV_DOWN: { struct net_device *tmp; LIST_HEAD(close_list); /* Put all VLANs for this dev in the down state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) list_add(&vlandev->close_list, &close_list); } dev_close_many(&close_list, false); list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); list_del_init(&vlandev->close_list); } list_del(&close_list); break; } case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = dev_get_flags(vlandev); if (flgs & IFF_UP) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) dev_change_flags(vlandev, flgs | IFF_UP, extack); vlan_stacked_transfer_operstate(dev, vlandev, vlan); } break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; vlan_group_for_each_dev(grp, i, vlandev) { /* removal of last vid destroys vlan_info, abort * afterwards */ if (vlan_info->nr_vids == 1) last = true; unregister_vlan_dev(vlandev, &list); if (last) break; } unregister_netdevice_many(&list); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlaying device to change its type. */ if (vlan_uses_dev(dev)) return NOTIFY_BAD; break; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); break; case NETDEV_CVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q)); if (err) return notifier_from_errno(err); break; case NETDEV_CVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q)); break; case NETDEV_SVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD)); if (err) return notifier_from_errno(err); break; case NETDEV_SVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD)); break; } out: return NOTIFY_DONE; } static struct notifier_block vlan_notifier_block __read_mostly = { .notifier_call = vlan_device_event, }; /* * VLAN IOCTL handler. * o execute requested action or pass command to the device driver * arg is really a struct vlan_ioctl_args __user *. */ static int vlan_ioctl_handler(struct net *net, void __user *arg) { int err; struct vlan_ioctl_args args; struct net_device *dev = NULL; if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args))) return -EFAULT; /* Null terminate this sucker, just in case. */ args.device1[sizeof(args.device1) - 1] = 0; args.u.device2[sizeof(args.u.device2) - 1] = 0; rtnl_lock(); switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: case SET_VLAN_EGRESS_PRIORITY_CMD: case SET_VLAN_FLAG_CMD: case ADD_VLAN_CMD: case DEL_VLAN_CMD: case GET_VLAN_REALDEV_NAME_CMD: case GET_VLAN_VID_CMD: err = -ENODEV; dev = __dev_get_by_name(net, args.device1); if (!dev) goto out; err = -EINVAL; if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev)) goto out; } switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; vlan_dev_set_ingress_priority(dev, args.u.skb_priority, args.vlan_qos); err = 0; break; case SET_VLAN_EGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_set_egress_priority(dev, args.u.skb_priority, args.vlan_qos); break; case SET_VLAN_FLAG_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_change_flags(dev, args.vlan_qos ? args.u.flag : 0, args.u.flag); break; case SET_VLAN_NAME_TYPE_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) { struct vlan_net *vn; vn = net_generic(net, vlan_net_id); vn->name_type = args.u.name_type; err = 0; } else { err = -EINVAL; } break; case ADD_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = register_vlan_device(dev, args.u.VID); break; case DEL_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; unregister_vlan_dev(dev, NULL); err = 0; break; case GET_VLAN_REALDEV_NAME_CMD: err = 0; vlan_dev_get_realdev_name(dev, args.u.device2, sizeof(args.u.device2)); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; case GET_VLAN_VID_CMD: err = 0; args.u.VID = vlan_dev_vlan_id(dev); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; default: err = -EOPNOTSUPP; break; } out: rtnl_unlock(); return err; } static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); int err; vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD; err = vlan_proc_init(net); return err; } static void __net_exit vlan_exit_net(struct net *net) { vlan_proc_cleanup(net); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, .exit = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct vlan_net), }; static int __init vlan_proto_init(void) { int err; pr_info("%s v%s\n", vlan_fullname, vlan_version); err = register_pernet_subsys(&vlan_net_ops); if (err < 0) goto err0; err = register_netdevice_notifier(&vlan_notifier_block); if (err < 0) goto err2; err = vlan_gvrp_init(); if (err < 0) goto err3; err = vlan_mvrp_init(); if (err < 0) goto err4; err = vlan_netlink_init(); if (err < 0) goto err5; vlan_ioctl_set(vlan_ioctl_handler); return 0; err5: vlan_mvrp_uninit(); err4: vlan_gvrp_uninit(); err3: unregister_netdevice_notifier(&vlan_notifier_block); err2: unregister_pernet_subsys(&vlan_net_ops); err0: return err; } static void __exit vlan_cleanup_module(void) { vlan_ioctl_set(NULL); vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); unregister_pernet_subsys(&vlan_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ vlan_mvrp_uninit(); vlan_gvrp_uninit(); } module_init(vlan_proto_init); module_exit(vlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); |
46 120 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BPF_CGROUP_H #define _BPF_CGROUP_H #include <linux/bpf.h> #include <linux/bpf-cgroup-defs.h> #include <linux/errno.h> #include <linux/jump_label.h> #include <linux/percpu.h> #include <linux/rbtree.h> #include <net/sock.h> #include <uapi/linux/bpf.h> struct sock; struct sockaddr; struct cgroup; struct sk_buff; struct bpf_map; struct bpf_prog; struct bpf_sock_ops_kern; struct bpf_cgroup_storage; struct ctl_table; struct ctl_table_header; struct task_struct; unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, const struct bpf_insn *insn); unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, const struct bpf_insn *insn); unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, const struct bpf_insn *insn); #ifdef CONFIG_CGROUP_BPF #define CGROUP_ATYPE(type) \ case BPF_##type: return type static inline enum cgroup_bpf_attach_type to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type) { switch (attach_type) { CGROUP_ATYPE(CGROUP_INET_INGRESS); CGROUP_ATYPE(CGROUP_INET_EGRESS); CGROUP_ATYPE(CGROUP_INET_SOCK_CREATE); CGROUP_ATYPE(CGROUP_SOCK_OPS); CGROUP_ATYPE(CGROUP_DEVICE); CGROUP_ATYPE(CGROUP_INET4_BIND); CGROUP_ATYPE(CGROUP_INET6_BIND); CGROUP_ATYPE(CGROUP_INET4_CONNECT); CGROUP_ATYPE(CGROUP_INET6_CONNECT); CGROUP_ATYPE(CGROUP_INET4_POST_BIND); CGROUP_ATYPE(CGROUP_INET6_POST_BIND); CGROUP_ATYPE(CGROUP_UDP4_SENDMSG); CGROUP_ATYPE(CGROUP_UDP6_SENDMSG); CGROUP_ATYPE(CGROUP_SYSCTL); CGROUP_ATYPE(CGROUP_UDP4_RECVMSG); CGROUP_ATYPE(CGROUP_UDP6_RECVMSG); CGROUP_ATYPE(CGROUP_GETSOCKOPT); CGROUP_ATYPE(CGROUP_SETSOCKOPT); CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME); CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME); CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME); CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME); CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE); default: return CGROUP_BPF_ATTACH_TYPE_INVALID; } } #undef CGROUP_ATYPE extern struct static_key_false cgroup_bpf_enabled_key[MAX_CGROUP_BPF_ATTACH_TYPE]; #define cgroup_bpf_enabled(atype) static_branch_unlikely(&cgroup_bpf_enabled_key[atype]) #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) struct bpf_cgroup_storage_map; struct bpf_storage_buffer { struct rcu_head rcu; char data[]; }; struct bpf_cgroup_storage { union { struct bpf_storage_buffer *buf; void __percpu *percpu_buf; }; struct bpf_cgroup_storage_map *map; struct bpf_cgroup_storage_key key; struct list_head list_map; struct list_head list_cg; struct rb_node node; struct rcu_head rcu; }; struct bpf_cgroup_link { struct bpf_link link; struct cgroup *cgroup; enum bpf_attach_type type; }; struct bpf_prog_list { struct hlist_node node; struct bpf_prog *prog; struct bpf_cgroup_link *link; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; }; int cgroup_bpf_inherit(struct cgroup *cgrp); void cgroup_bpf_offline(struct cgroup *cgrp); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sk(struct sock *sk, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, int *optname, char __user *optval, int *optlen, char **kernel_optval); int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen, int max_optlen, int retval); int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, int optname, void *optval, int *optlen, int retval); static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return BPF_CGROUP_STORAGE_PERCPU; return BPF_CGROUP_STORAGE_SHARED; } struct bpf_cgroup_storage * cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, void *key, bool locked); struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype); void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, struct cgroup *cgroup, enum bpf_attach_type type); void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map); int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags); /* Opportunistic check to see whether we have any BPF program attached*/ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, enum cgroup_bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); struct bpf_prog_array *array; array = rcu_access_pointer(cgrp->bpf.effective[type]); return array != &bpf_empty_prog_array.hdr; } /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && \ cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ CGROUP_INET_INGRESS); \ \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \ cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ CGROUP_INET_EGRESS); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_SK_PROG(sk, atype) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ __ret = __cgroup_bpf_run_filter_sk(sk, atype); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_CREATE) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_RELEASE) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET4_POST_BIND) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND) #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ NULL, NULL); \ __ret; \ }) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ t_ctx, NULL); \ release_sock(sk); \ } \ __ret; \ }) /* BPF_CGROUP_INET4_BIND and BPF_CGROUP_INET6_BIND can return extra flags * via upper bits of return code. The only flag that is supported * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE). */ #define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags) \ ({ \ u32 __flags = 0; \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ NULL, &__flags); \ release_sock(sk); \ if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ *bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE; \ } \ __ret; \ }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ ((cgroup_bpf_enabled(CGROUP_INET4_CONNECT) || \ cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) && \ (sk)->sk_prot->pre_connect) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL) /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a * fullsock and its parent fullsock cannot be traced by * sk_to_full_sk(). * * e.g. sock_ops->sk is a request_sock and it is under syncookie mode. * Its listener-sk is not attached to the rsk_listener. * In this case, the caller holds the listener-sk (unlocked), * set its sock_ops->sk to req_sk, and call this SOCK_OPS"_SK" with * the listener-sk such that the cgroup-bpf-progs of the * listener-sk will be run. * * Regardless of syncookie mode or not, * calling bpf_setsockopt on listener-sk will not make sense anyway, * so passing 'sock_ops->sk == req_sk' to the bpf prog is appropriate here. */ #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ sock_ops, \ CGROUP_SOCK_OPS); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ sock_ops, \ CGROUP_SOCK_OPS); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_DEVICE)) \ __ret = __cgroup_bpf_check_dev_permission(atype, major, minor, \ access, \ CGROUP_DEVICE); \ \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ CGROUP_SYSCTL); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ kernel_optval) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT) && \ cgroup_bpf_sock_enabled(sock, CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ kernel_optval); \ __ret; \ }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ get_user(__ret, optlen); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen, \ max_optlen, retval) \ ({ \ int __ret = retval; \ if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT) && \ cgroup_bpf_sock_enabled(sock, CGROUP_GETSOCKOPT)) \ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ tcp_bpf_bypass_getsockopt, \ level, optname)) \ __ret = __cgroup_bpf_run_filter_getsockopt( \ sock, level, optname, optval, optlen, \ max_optlen, retval); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ optlen, retval) \ ({ \ int __ret = retval; \ if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ sock, level, optname, optval, optlen, retval); \ __ret; \ }) int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); const struct bpf_func_proto * cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); const struct bpf_func_proto * cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); #else static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog) { return -EINVAL; } static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { return -EINVAL; } static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EINVAL; } static inline const struct bpf_func_proto * cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return NULL; } static inline const struct bpf_func_proto * cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return NULL; } static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; } static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value) { return 0; } static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags) { return 0; } #define cgroup_bpf_enabled(atype) (0) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ optlen, retval) ({ retval; }) #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ kernel_optval) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) #endif /* CONFIG_CGROUP_BPF */ #endif /* _BPF_CGROUP_H */ |
11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_RTNETLINK_H #define __NET_RTNETLINK_H #include <linux/rtnetlink.h> #include <net/netlink.h> typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *); typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = BIT(0), RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1), }; enum rtnl_kinds { RTNL_KIND_NEW, RTNL_KIND_DEL, RTNL_KIND_GET, RTNL_KIND_SET }; #define RTNL_KIND_MASK 0x3 static inline enum rtnl_kinds rtnl_msgtype_kind(int msgtype) { return msgtype & RTNL_KIND_MASK; } void rtnl_register(int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); int rtnl_register_module(struct module *owner, int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); int rtnl_unregister(int protocol, int msgtype); void rtnl_unregister_all(int protocol); static inline int rtnl_msg_family(const struct nlmsghdr *nlh) { if (nlmsg_len(nlh) >= sizeof(struct rtgenmsg)) return ((struct rtgenmsg *) nlmsg_data(nlh))->rtgen_family; else return AF_UNSPEC; } /** * struct rtnl_link_ops - rtnetlink link operations * * @list: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit * @maxtype: Highest device specific netlink attribute number * @policy: Netlink policy for device specific attribute validation * @validate: Optional validation function for netlink/changelink parameters * @alloc: netdev allocation function, can be %NULL and is then used * in place of alloc_netdev_mqs(), in this case @priv_size * and @setup are unused. Returns a netdev or ERR_PTR(). * @priv_size: sizeof net_device private space * @setup: net_device setup function * @newlink: Function for configuring and registering a new device * @changelink: Function for changing parameters of an existing device * @dellink: Function to remove a device * @get_size: Function to calculate required room for dumping device * specific netlink attributes * @fill_info: Function to dump device specific netlink attributes * @get_xstats_size: Function to calculate required room for dumping device * specific statistics * @fill_xstats: Function to dump device specific statistics * @get_num_tx_queues: Function to determine number of transmit queues * to create when creating a new device. * @get_num_rx_queues: Function to determine number of receive queues * to create when creating a new device. * @get_link_net: Function to get the i/o netns of the device * @get_linkxstats_size: Function to calculate the required room for * dumping device-specific extended link stats * @fill_linkxstats: Function to dump device-specific extended link stats */ struct rtnl_link_ops { struct list_head list; const char *kind; size_t priv_size; struct net_device *(*alloc)(struct nlattr *tb[], const char *ifname, unsigned char name_assign_type, unsigned int num_tx_queues, unsigned int num_rx_queues); void (*setup)(struct net_device *dev); bool netns_refund; unsigned int maxtype; const struct nla_policy *policy; int (*validate)(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); int (*newlink)(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); int (*changelink)(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); void (*dellink)(struct net_device *dev, struct list_head *head); size_t (*get_size)(const struct net_device *dev); int (*fill_info)(struct sk_buff *skb, const struct net_device *dev); size_t (*get_xstats_size)(const struct net_device *dev); int (*fill_xstats)(struct sk_buff *skb, const struct net_device *dev); unsigned int (*get_num_tx_queues)(void); unsigned int (*get_num_rx_queues)(void); unsigned int slave_maxtype; const struct nla_policy *slave_policy; int (*slave_changelink)(struct net_device *dev, struct net_device *slave_dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); size_t (*get_slave_size)(const struct net_device *dev, const struct net_device *slave_dev); int (*fill_slave_info)(struct sk_buff *skb, const struct net_device *dev, const struct net_device *slave_dev); struct net *(*get_link_net)(const struct net_device *dev); size_t (*get_linkxstats_size)(const struct net_device *dev, int attr); int (*fill_linkxstats)(struct sk_buff *skb, const struct net_device *dev, int *prividx, int attr); }; int __rtnl_link_register(struct rtnl_link_ops *ops); void __rtnl_link_unregister(struct rtnl_link_ops *ops); int rtnl_link_register(struct rtnl_link_ops *ops); void rtnl_link_unregister(struct rtnl_link_ops *ops); /** * struct rtnl_af_ops - rtnetlink address family operations * * @list: Used internally * @family: Address family * @fill_link_af: Function to fill IFLA_AF_SPEC with address family * specific netlink attributes. * @get_link_af_size: Function to calculate size of address family specific * netlink attributes. * @validate_link_af: Validate a IFLA_AF_SPEC attribute, must check attr * for invalid configuration settings. * @set_link_af: Function to parse a IFLA_AF_SPEC attribute and modify * net_device accordingly. */ struct rtnl_af_ops { struct list_head list; int family; int (*fill_link_af)(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask); size_t (*get_link_af_size)(const struct net_device *dev, u32 ext_filter_mask); int (*validate_link_af)(const struct net_device *dev, const struct nlattr *attr, struct netlink_ext_ack *extack); int (*set_link_af)(struct net_device *dev, const struct nlattr *attr, struct netlink_ext_ack *extack); int (*fill_stats_af)(struct sk_buff *skb, const struct net_device *dev); size_t (*get_stats_af_size)(const struct net_device *dev); }; void rtnl_af_register(struct rtnl_af_ops *ops); void rtnl_af_unregister(struct rtnl_af_ops *ops); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]); struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack); int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh); int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh); int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, struct netlink_ext_ack *exterr); struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid); #define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind) #endif |
1 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | /* * net/tipc/diag.c: TIPC socket diag * * Copyright (c) 2018, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "socket.h" #include <linux/sock_diag.h> #include <linux/tipc_sockets_diag.h> static u64 __tipc_diag_gen_cookie(struct sock *sk) { u32 res[2]; sock_diag_save_cookie(sk, res); return *((u64 *)res); } static int __tipc_add_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, struct tipc_sock *tsk) { struct tipc_sock_diag_req *req = nlmsg_data(cb->nlh); struct nlmsghdr *nlh; int err; nlh = nlmsg_put_answer(skb, cb, SOCK_DIAG_BY_FAMILY, 0, NLM_F_MULTI); if (!nlh) return -EMSGSIZE; err = tipc_sk_fill_sock_diag(skb, cb, tsk, req->tidiag_states, __tipc_diag_gen_cookie); if (err) return err; nlmsg_end(skb, nlh); return 0; } static int tipc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { return tipc_nl_sk_walk(skb, cb, __tipc_add_sock_diag); } static int tipc_sock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct tipc_sock_diag_req); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = tipc_dump_start, .dump = tipc_diag_dump, .done = tipc_dump_done, }; netlink_dump_start(net->diag_nlsk, skb, h, &c); return 0; } return -EOPNOTSUPP; } static const struct sock_diag_handler tipc_sock_diag_handler = { .family = AF_TIPC, .dump = tipc_sock_diag_handler_dump, }; static int __init tipc_diag_init(void) { return sock_diag_register(&tipc_sock_diag_handler); } static void __exit tipc_diag_exit(void) { sock_diag_unregister(&tipc_sock_diag_handler); } module_init(tipc_diag_init); module_exit(tipc_diag_exit); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_TIPC); |
1244 269 1244 1292 173 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_USER_NAMESPACE_H #define _LINUX_USER_NAMESPACE_H #include <linux/kref.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/rwsem.h> #include <linux/sysctl.h> #include <linux/err.h> #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ u32 nr_extents; union { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; #define USERNS_SETGROUPS_ALLOWED 1UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED struct ucounts; enum ucount_type { UCOUNT_USER_NAMESPACES, UCOUNT_PID_NAMESPACES, UCOUNT_UTS_NAMESPACES, UCOUNT_IPC_NAMESPACES, UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, #endif #ifdef CONFIG_FANOTIFY UCOUNT_FANOTIFY_GROUPS, UCOUNT_FANOTIFY_MARKS, #endif UCOUNT_COUNTS, }; enum rlimit_type { UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_RLIMIT_SIGPENDING, UCOUNT_RLIMIT_MEMLOCK, UCOUNT_RLIMIT_COUNTS, }; struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; struct uid_gid_map projid_map; struct user_namespace *parent; int level; kuid_t owner; kgid_t group; struct ns_common ns; unsigned long flags; /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP * in its effective capability set at the child ns creation time. */ bool parent_could_setfcap; #ifdef CONFIG_KEYS /* List of joinable keyrings in this namespace. Modification access of * these pointers is controlled by keyring_sem. Once * user_keyring_register is set, it won't be changed, so it can be * accessed directly with READ_ONCE(). */ struct list_head keyring_name_list; struct key *user_keyring_register; struct rw_semaphore keyring_sem; #endif /* Register of per-UID persistent keyrings for this namespace */ #ifdef CONFIG_PERSISTENT_KEYRINGS struct key *persistent_keyring_register; #endif struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #endif struct ucounts *ucounts; long ucount_max[UCOUNT_COUNTS]; long rlimit_max[UCOUNT_RLIMIT_COUNTS]; } __randomize_layout; struct ucounts { struct hlist_node node; struct user_namespace *ns; kuid_t uid; atomic_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; extern struct user_namespace init_user_ns; extern struct ucounts init_ucounts; bool setup_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); struct ucounts * __must_check get_ucounts(struct ucounts *ucounts); void put_ucounts(struct ucounts *ucounts); static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { return atomic_long_read(&ucounts->rlimit[type]); } long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type); void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type) { return READ_ONCE(ns->rlimit_max[type]); } static inline void set_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type, unsigned long max) { ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX; } #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) refcount_inc(&ns->ns.count); return ns; } extern int create_user_ns(struct cred *new); extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { if (ns && refcount_dec_and_test(&ns->ns.count)) __put_user_ns(ns); } struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); extern bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child); extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; } static inline int create_user_ns(struct cred *new) { return -EINVAL; } static inline int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { if (unshare_flags & CLONE_NEWUSER) return -EINVAL; return 0; } static inline void put_user_ns(struct user_namespace *ns) { } static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } static inline bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { return true; } static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; } static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } #endif #endif /* _LINUX_USER_H */ |
4 4 11 11 2 2 2 1 2 5 5 5 8 7 8 3 2 2 2 1 2 7 6 5 3 2 2 4 2 2 2 2 7 4 2 4 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2018 Facebook */ #include <linux/bpf.h> #include <linux/err.h> #include <linux/sock_diag.h> #include <net/sock_reuseport.h> #include <linux/btf_ids.h> struct reuseport_array { struct bpf_map map; struct sock __rcu *ptrs[]; }; static struct reuseport_array *reuseport_array(struct bpf_map *map) { return (struct reuseport_array *)map; } /* The caller must hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { struct sock __rcu **socks; write_lock_bh(&sk->sk_callback_lock); socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF); if (socks) { WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of * sk->sk_callback_lock because there is * a race with reuseport_array_free() * which does not hold the reuseport_lock. */ RCU_INIT_POINTER(*socks, NULL); } write_unlock_bh(&sk->sk_callback_lock); } static int reuseport_array_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) return -EINVAL; return array_map_alloc_check(attr); } static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return rcu_dereference(array->ptrs[index]); } /* Called from syscall only */ static long reuseport_array_delete_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; struct sock *sk; int err; if (index >= map->max_entries) return -E2BIG; if (!rcu_access_pointer(array->ptrs[index])) return -ENOENT; spin_lock_bh(&reuseport_lock); sk = rcu_dereference_protected(array->ptrs[index], lockdep_is_held(&reuseport_lock)); if (sk) { write_lock_bh(&sk->sk_callback_lock); WRITE_ONCE(sk->sk_user_data, NULL); RCU_INIT_POINTER(array->ptrs[index], NULL); write_unlock_bh(&sk->sk_callback_lock); err = 0; } else { err = -ENOENT; } spin_unlock_bh(&reuseport_lock); return err; } static void reuseport_array_free(struct bpf_map *map) { struct reuseport_array *array = reuseport_array(map); struct sock *sk; u32 i; /* * ops->map_*_elem() will not be able to access this * array now. Hence, this function only races with * bpf_sk_reuseport_detach() which was triggered by * close() or disconnect(). * * This function and bpf_sk_reuseport_detach() are * both removing sk from "array". Who removes it * first does not matter. * * The only concern here is bpf_sk_reuseport_detach() * may access "array" which is being freed here. * bpf_sk_reuseport_detach() access this "array" * through sk->sk_user_data _and_ with sk->sk_callback_lock * held which is enough because this "array" is not freed * until all sk->sk_user_data has stopped referencing this "array". * * Hence, due to the above, taking "reuseport_lock" is not * needed here. */ /* * Since reuseport_lock is not taken, sk is accessed under * rcu_read_lock() */ rcu_read_lock(); for (i = 0; i < map->max_entries; i++) { sk = rcu_dereference(array->ptrs[i]); if (sk) { write_lock_bh(&sk->sk_callback_lock); /* * No need for WRITE_ONCE(). At this point, * no one is reading it without taking the * sk->sk_callback_lock. */ sk->sk_user_data = NULL; write_unlock_bh(&sk->sk_callback_lock); RCU_INIT_POINTER(array->ptrs[i], NULL); } } rcu_read_unlock(); /* * Once reaching here, all sk->sk_user_data is not * referencing this "array". "array" can be freed now. */ bpf_map_area_free(array); } static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node); if (!array) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); return &array->map; } int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { struct sock *sk; int err; if (map->value_size != sizeof(u64)) return -ENOSPC; rcu_read_lock(); sk = reuseport_array_lookup_elem(map, key); if (sk) { *(u64 *)value = __sock_gen_cookie(sk); err = 0; } else { err = -ENOENT; } rcu_read_unlock(); return err; } static int reuseport_array_update_check(const struct reuseport_array *array, const struct sock *nsk, const struct sock *osk, const struct sock_reuseport *nsk_reuse, u32 map_flags) { if (osk && map_flags == BPF_NOEXIST) return -EEXIST; if (!osk && map_flags == BPF_EXIST) return -ENOENT; if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) return -ENOTSUPP; if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) return -ENOTSUPP; if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) return -ENOTSUPP; /* * sk must be hashed (i.e. listening in the TCP case or binded * in the UDP case) and * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). * * Also, sk will be used in bpf helper that is protected by * rcu_read_lock(). */ if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) return -EINVAL; /* READ_ONCE because the sk->sk_callback_lock may not be held here */ if (READ_ONCE(nsk->sk_user_data)) return -EBUSY; return 0; } /* * Called from syscall only. * The "nsk" in the fd refcnt. * The "osk" and "reuse" are protected by reuseport_lock. */ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct reuseport_array *array = reuseport_array(map); struct sock *free_osk = NULL, *osk, *nsk; struct sock_reuseport *reuse; u32 index = *(u32 *)key; uintptr_t sk_user_data; struct socket *socket; int err, fd; if (map_flags > BPF_EXIST) return -EINVAL; if (index >= map->max_entries) return -E2BIG; if (map->value_size == sizeof(u64)) { u64 fd64 = *(u64 *)value; if (fd64 > S32_MAX) return -EINVAL; fd = fd64; } else { fd = *(int *)value; } socket = sockfd_lookup(fd, &err); if (!socket) return err; nsk = socket->sk; if (!nsk) { err = -EINVAL; goto put_file; } /* Quick checks before taking reuseport_lock */ err = reuseport_array_update_check(array, nsk, rcu_access_pointer(array->ptrs[index]), rcu_access_pointer(nsk->sk_reuseport_cb), map_flags); if (err) goto put_file; spin_lock_bh(&reuseport_lock); /* * Some of the checks only need reuseport_lock * but it is done under sk_callback_lock also * for simplicity reason. */ write_lock_bh(&nsk->sk_callback_lock); osk = rcu_dereference_protected(array->ptrs[index], lockdep_is_held(&reuseport_lock)); reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); if (err) goto put_file_unlock; sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF; WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; err = 0; put_file_unlock: write_unlock_bh(&nsk->sk_callback_lock); if (free_osk) { write_lock_bh(&free_osk->sk_callback_lock); WRITE_ONCE(free_osk->sk_user_data, NULL); write_unlock_bh(&free_osk->sk_callback_lock); } spin_unlock_bh(&reuseport_lock); put_file: fput(socket->file); return err; } /* Called from syscall */ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct reuseport_array *array = reuseport_array(map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= array->map.max_entries) { *next = 0; return 0; } if (index == array->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } static u64 reuseport_array_mem_usage(const struct bpf_map *map) { struct reuseport_array *array; return struct_size(array, ptrs, map->max_entries); } BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array) const struct bpf_map_ops reuseport_array_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = reuseport_array_alloc_check, .map_alloc = reuseport_array_alloc, .map_free = reuseport_array_free, .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, .map_mem_usage = reuseport_array_mem_usage, .map_btf_id = &reuseport_array_map_btf_ids[0], }; |
1 5574 129 9171 1486 8445 1482 129 1 5576 312 130 10 5687 5687 5681 127 2295 451 408 5573 5573 5570 1626 69 65 10 10 10 5573 5521 5571 5522 1519 5523 5577 5576 5572 129 129 10 6 10 1 128 2 129 129 2019 2261 2147 2213 2260 129 129 129 9 129 2 2 5575 5578 5574 5574 5552 1522 4560 5578 5576 2280 1565 1565 46 1542 5364 5578 5574 5575 5570 5441 5571 129 5577 4266 4274 4260 4276 4274 2305 2304 2304 2305 1963 556 556 205 435 414 436 436 1946 1946 1946 1946 1945 1945 24 24 24 21 21 19 1 20 271 270 271 271 271 1 270 270 270 271 271 271 271 271 269 270 271 271 271 1 1 271 1 1 1 1 1 1 1 1 1 271 448 80 1 79 19 20 1 76 447 448 447 448 42 409 409 409 409 409 409 409 133 312 312 311 312 312 312 296 32 32 32 32 33 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 33 33 33 32 32 32 32 32 32 32 33 33 33 32 32 33 33 33 32 32 33 32 32 34 34 34 34 33 33 33 32 32 32 32 32 32 32 1401 1399 1401 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 | // SPDX-License-Identifier: GPL-2.0-only /* * kernel/workqueue.c - generic async execution with shared worker pool * * Copyright (C) 2002 Ingo Molnar * * Derived from the taskqueue/keventd code by: * David Woodhouse <dwmw2@infradead.org> * Andrew Morton * Kai Petzke <wpp@marie.physik.tu-berlin.de> * Theodore Ts'o <tytso@mit.edu> * * Made to use alloc_percpu by Christoph Lameter. * * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo <tj@kernel.org> * * This is the generic async execution mechanism. Work items as are * executed in process context. The worker pool is shared and * automatically managed. There are two worker pools for each CPU (one for * normal work items and the other for high priority ones) and some extra * pools for workqueues which are not bound to any specific CPU - the * number of these backing pools is dynamic. * * Please read Documentation/core-api/workqueue.rst for details. */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/init.h> #include <linux/signal.h> #include <linux/completion.h> #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/kthread.h> #include <linux/hardirq.h> #include <linux/mempolicy.h> #include <linux/freezer.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> #include <linux/idr.h> #include <linux/jhash.h> #include <linux/hashtable.h> #include <linux/rculist.h> #include <linux/nodemask.h> #include <linux/moduleparam.h> #include <linux/uaccess.h> #include <linux/sched/isolation.h> #include <linux/sched/debug.h> #include <linux/nmi.h> #include <linux/kvm_para.h> #include <linux/delay.h> #include "workqueue_internal.h" enum { /* * worker_pool flags * * A bound pool is either associated or disassociated with its CPU. * While associated (!DISASSOCIATED), all workers are bound to the * CPU and none has %WORKER_UNBOUND set and concurrency management * is in effect. * * While DISASSOCIATED, the cpu may be offline and all workers have * %WORKER_UNBOUND set and concurrency management disabled, and may * be executing on any CPU. The pool behaves as an unbound one. * * Note that DISASSOCIATED should be flipped only while holding * wq_pool_attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. */ POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ /* worker flags */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ WORKER_REBOUND = 1 << 8, /* worker was rebound */ WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | WORKER_UNBOUND | WORKER_REBOUND, NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2, /* call for help after 10ms (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ /* * Rescue workers are used only on emergencies and shared by * all cpus. Give MIN_NICE. */ RESCUER_NICE_LEVEL = MIN_NICE, HIGHPRI_NICE_LEVEL = MIN_NICE, WQ_NAME_LEN = 24, }; /* * Structure fields follow one of the following exclusion rules. * * I: Modifiable by initialization/destruction paths and read-only for * everyone else. * * P: Preemption protected. Disabling preemption is enough and should * only be modified and accessed from the local cpu. * * L: pool->lock protected. Access with pool->lock held. * * K: Only modified by worker while holding pool->lock. Can be safely read by * self, while holding pool->lock or from IRQ context if %current is the * kworker. * * S: Only modified by worker self. * * A: wq_pool_attach_mutex protected. * * PL: wq_pool_mutex protected. * * PR: wq_pool_mutex protected for writes. RCU protected for reads. * * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads. * * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or * RCU for reads. * * WQ: wq->mutex protected. * * WR: wq->mutex protected for writes. RCU protected for reads. * * MD: wq_mayday_lock protected. * * WD: Used internally by the watchdog. */ /* struct worker is defined in workqueue_internal.h */ struct worker_pool { raw_spinlock_t lock; /* the pool lock */ int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ unsigned int flags; /* L: flags */ unsigned long watchdog_ts; /* L: watchdog timestamp */ bool cpu_stall; /* WD: stalled cpu bound pool */ /* * The counter is incremented in a process context on the associated CPU * w/ preemption disabled, and decremented or reset in the same context * but w/ pool->lock held. The readers grab pool->lock and are * guaranteed to see if the counter reached zero. */ int nr_running; struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ int nr_idle; /* L: currently idle workers */ struct list_head idle_list; /* L: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ struct work_struct idle_cull_work; /* L: worker idle cleanup */ struct timer_list mayday_timer; /* L: SOS timer for workers */ /* a workers is either on busy_hash or idle_list, or the manager */ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ struct worker *manager; /* L: purely informational */ struct list_head workers; /* A: attached workers */ struct list_head dying_workers; /* A: workers about to die */ struct completion *detach_completion; /* all workers detached */ struct ida worker_ida; /* worker IDs for task name */ struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ int refcnt; /* PL: refcnt for unbound pools */ /* * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; }; /* * Per-pool_workqueue statistics. These can be monitored using * tools/workqueue/wq_monitor.py. */ enum pool_workqueue_stats { PWQ_STAT_STARTED, /* work items started execution */ PWQ_STAT_COMPLETED, /* work items completed execution */ PWQ_STAT_CPU_TIME, /* total CPU time consumed */ PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */ PWQ_STAT_MAYDAY, /* maydays to rescuer */ PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ PWQ_NR_STATS, }; /* * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS * of work_struct->data are used for flags and the remaining high bits * point to the pwq; thus, pwqs need to be aligned at two's power of the * number of flag bits. */ struct pool_workqueue { struct worker_pool *pool; /* I: the associated pool */ struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ int flush_color; /* L: flushing color */ int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ /* * nr_active management and WORK_STRUCT_INACTIVE: * * When pwq->nr_active >= max_active, new work item is queued to * pwq->inactive_works instead of pool->worklist and marked with * WORK_STRUCT_INACTIVE. * * All work items marked with WORK_STRUCT_INACTIVE do not participate * in pwq->nr_active and all work items in pwq->inactive_works are * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE * work items are in pwq->inactive_works. Some of them are ready to * run in pool->worklist or worker->scheduled. Those work itmes are * only struct wq_barrier which is used for flush_work() and should * not participate in pwq->nr_active. For non-barrier work item, it * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. */ int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head inactive_works; /* L: inactive works */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ u64 stats[PWQ_NR_STATS]; /* * Release of unbound pwq is punted to a kthread_worker. See put_pwq() * and pwq_release_workfn() for details. pool_workqueue itself is also * RCU protected so that the first pwq can be determined without * grabbing wq->mutex. */ struct kthread_work release_work; struct rcu_head rcu; } __aligned(1 << WORK_STRUCT_FLAG_BITS); /* * Structure used to wait for workqueue flush. */ struct wq_flusher { struct list_head list; /* WQ: list of flushers */ int flush_color; /* WQ: flush color waiting for */ struct completion done; /* flush completion */ }; struct wq_device; /* * The externally visible workqueue. It relays the issued work items to * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { struct list_head pwqs; /* WR: all pwqs of this wq */ struct list_head list; /* PR: list of all workqueues */ struct mutex mutex; /* protects this wq */ int work_color; /* WQ: current work color */ int flush_color; /* WQ: current flush color */ atomic_t nr_pwqs_to_flush; /* flush in progress */ struct wq_flusher *first_flusher; /* WQ: first flusher */ struct list_head flusher_queue; /* WQ: flush waiters */ struct list_head flusher_overflow; /* WQ: flush overflow list */ struct list_head maydays; /* MD: pwqs requesting rescue */ struct worker *rescuer; /* MD: rescue worker */ int nr_drainers; /* WQ: drain in progress */ int saved_max_active; /* WQ: saved pwq max_active */ struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ #endif #ifdef CONFIG_LOCKDEP char *lock_name; struct lock_class_key key; struct lockdep_map lockdep_map; #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ /* * Destruction of workqueue_struct is RCU protected to allow walking * the workqueues list without grabbing wq_pool_mutex. * This is used to dump all workqueues from sysrq. */ struct rcu_head rcu; /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */ }; static struct kmem_cache *pwq_cache; /* * Each pod type describes how CPUs should be grouped for unbound workqueues. * See the comment above workqueue_attrs->affn_scope. */ struct wq_pod_type { int nr_pods; /* number of pods */ cpumask_var_t *pod_cpus; /* pod -> cpus */ int *pod_node; /* pod -> node */ int *cpu_pod; /* cpu -> pod */ }; static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { [WQ_AFFN_DFL] = "default", [WQ_AFFN_CPU] = "cpu", [WQ_AFFN_SMT] = "smt", [WQ_AFFN_CACHE] = "cache", [WQ_AFFN_NUMA] = "numa", [WQ_AFFN_SYSTEM] = "system", }; /* * Per-cpu work items which run for longer than the following threshold are * automatically considered CPU intensive and excluded from concurrency * management to prevent them from noticeably delaying other per-cpu work items. * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter. * The actual value is initialized in wq_cpu_intensive_thresh_init(). */ static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); /* see the comment above the definition of WQ_POWER_EFFICIENT */ static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); static bool wq_online; /* can kworkers be created yet? */ /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ static struct workqueue_attrs *wq_update_pod_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ /* wait for manager to go away */ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; /* for further constrain wq_unbound_cpumask by cmdline parameter*/ static struct cpumask wq_cmdline_cpumask __initdata; /* CPU where unbound work was last round robin scheduled from this CPU */ static DEFINE_PER_CPU(int, wq_rr_cpu_last); /* * Local execution of unbound work items is no longer guaranteed. The * following always forces round-robin CPU selection on unbound work items * to uncover usages which depend on it. */ #ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU static bool wq_debug_force_rr_cpu = true; #else static bool wq_debug_force_rr_cpu = false; #endif module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ /* PL: hash of all unbound pools keyed by pool->attrs */ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); /* I: attributes used when instantiating standard unbound pools on demand */ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; /* I: attributes used when instantiating ordered pools on demand */ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; /* * I: kthread_worker to release pwq's. pwq release needs to be bounced to a * process context while holding a pool lock. Bounce to a dedicated kthread * worker to avoid A-A deadlocks. */ static struct kthread_worker *pwq_release_worker; struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; EXPORT_SYMBOL_GPL(system_highpri_wq); struct workqueue_struct *system_long_wq __read_mostly; EXPORT_SYMBOL_GPL(system_long_wq); struct workqueue_struct *system_unbound_wq __read_mostly; EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); struct workqueue_struct *system_power_efficient_wq __read_mostly; EXPORT_SYMBOL_GPL(system_power_efficient_wq); struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); static void show_pwq(struct pool_workqueue *pwq); static void show_one_worker_pool(struct worker_pool *pool); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> #define assert_rcu_or_pool_mutex() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq_pool_mutex), \ "RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex) && \ !lockdep_is_held(&wq_pool_mutex), \ "RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ (pool)++) /** * for_each_pool - iterate through all worker_pools in the system * @pool: iteration cursor * @pi: integer used for iteration * * This must be called either with wq_pool_mutex held or RCU read * locked. If the pool needs to be used beyond the locking in effect, the * caller is responsible for guaranteeing that the pool stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool(pool, pi) \ idr_for_each_entry(&worker_pool_idr, pool, pi) \ if (({ assert_rcu_or_pool_mutex(); false; })) { } \ else /** * for_each_pool_worker - iterate through all workers of a worker_pool * @worker: iteration cursor * @pool: worker_pool to iterate workers of * * This must be called with wq_pool_attach_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool_worker(worker, pool) \ list_for_each_entry((worker), &(pool)->workers, node) \ if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \ else /** * for_each_pwq - iterate through all pool_workqueues of the specified workqueue * @pwq: iteration cursor * @wq: the target workqueue * * This must be called either with wq->mutex held or RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pwq(pwq, wq) \ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \ lockdep_is_held(&(wq->mutex))) #ifdef CONFIG_DEBUG_OBJECTS_WORK static const struct debug_obj_descr work_debug_descr; static void *work_debug_hint(void *addr) { return ((struct work_struct *) addr)->func; } static bool work_is_static_object(void *addr) { struct work_struct *work = addr; return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); } /* * fixup_init is called when: * - an active object is initialized */ static bool work_fixup_init(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; switch (state) { case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_init(work, &work_debug_descr); return true; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool work_fixup_free(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; switch (state) { case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_free(work, &work_debug_descr); return true; default: return false; } } static const struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, .is_static_object = work_is_static_object, .fixup_init = work_fixup_init, .fixup_free = work_fixup_free, }; static inline void debug_work_activate(struct work_struct *work) { debug_object_activate(work, &work_debug_descr); } static inline void debug_work_deactivate(struct work_struct *work) { debug_object_deactivate(work, &work_debug_descr); } void __init_work(struct work_struct *work, int onstack) { if (onstack) debug_object_init_on_stack(work, &work_debug_descr); else debug_object_init(work, &work_debug_descr); } EXPORT_SYMBOL_GPL(__init_work); void destroy_work_on_stack(struct work_struct *work) { debug_object_free(work, &work_debug_descr); } EXPORT_SYMBOL_GPL(destroy_work_on_stack); void destroy_delayed_work_on_stack(struct delayed_work *work) { destroy_timer_on_stack(&work->timer); debug_object_free(&work->work, &work_debug_descr); } EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); #else static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif /** * worker_pool_assign_id - allocate ID and assign it to @pool * @pool: the pool pointer of interest * * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned * successfully, -errno on failure. */ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; lockdep_assert_held(&wq_pool_mutex); ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, GFP_KERNEL); if (ret >= 0) { pool->id = ret; return 0; } return ret; } static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; } static int get_work_color(unsigned long work_data) { return (work_data >> WORK_STRUCT_COLOR_SHIFT) & ((1 << WORK_STRUCT_COLOR_BITS) - 1); } static int work_next_color(int color) { return (color + 1) % WORK_NR_COLORS; } /* * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data * contain the pointer to the queued pwq. Once execution starts, the flag * is cleared and the high bits contain OFFQ flags and pool ID. * * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling() * and clear_work_data() can be used to set the pwq, pool or clear * work->data. These functions should only be called while the work is * owned - ie. while the PENDING bit is set. * * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq * corresponding to a work. Pool is available once the work has been * queued anywhere after initialization until it is sync canceled. pwq is * available only while the work item is queued. * * %WORK_OFFQ_CANCELING is used to mark a work item which is being * canceled. While being canceled, a work item may have its PENDING set * but stay off timer and worklist for arbitrarily long and nobody should * try to steal the PENDING bit. */ static inline void set_work_data(struct work_struct *work, unsigned long data, unsigned long flags) { WARN_ON_ONCE(!work_pending(work)); atomic_long_set(&work->data, data | flags | work_static(work)); } static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq, unsigned long extra_flags) { set_work_data(work, (unsigned long)pwq, WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags); } static void set_work_pool_and_keep_pending(struct work_struct *work, int pool_id) { set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, WORK_STRUCT_PENDING); } static void set_work_pool_and_clear_pending(struct work_struct *work, int pool_id) { /* * The following wmb is paired with the implied mb in * test_and_set_bit(PENDING) and ensures all updates to @work made * here are visible to and precede any updates by the next PENDING * owner. */ smp_wmb(); set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); /* * The following mb guarantees that previous clear of a PENDING bit * will not be reordered with any speculative LOADS or STORES from * work->current_func, which is executed afterwards. This possible * reordering can lead to a missed execution on attempt to queue * the same @work. E.g. consider this case: * * CPU#0 CPU#1 * ---------------------------- -------------------------------- * * 1 STORE event_indicated * 2 queue_work_on() { * 3 test_and_set_bit(PENDING) * 4 } set_..._and_clear_pending() { * 5 set_work_data() # clear bit * 6 smp_mb() * 7 work->current_func() { * 8 LOAD event_indicated * } * * Without an explicit full barrier speculative LOAD on line 8 can * be executed before CPU#0 does STORE on line 1. If that happens, * CPU#0 observes the PENDING bit is still set and new execution of * a @work is not queued in a hope, that CPU#1 will eventually * finish the queued @work. Meanwhile CPU#1 does not see * event_indicated is set, because speculative LOAD was executed * before actual STORE. */ smp_mb(); } static void clear_work_data(struct work_struct *work) { smp_wmb(); /* see set_work_pool_and_clear_pending() */ set_work_data(work, WORK_STRUCT_NO_POOL, 0); } static inline struct pool_workqueue *work_struct_pwq(unsigned long data) { return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK); } static struct pool_workqueue *get_work_pwq(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) return work_struct_pwq(data); else return NULL; } /** * get_work_pool - return the worker_pool a given work was associated with * @work: the work item of interest * * Pools are created and destroyed under wq_pool_mutex, and allows read * access under RCU read lock. As such, this function should be * called under wq_pool_mutex or inside of a rcu_read_lock() region. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used * beyond the critical section, the caller is responsible for ensuring the * returned pool is and stays online. * * Return: The worker_pool @work was last associated with. %NULL if none. */ static struct worker_pool *get_work_pool(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); int pool_id; assert_rcu_or_pool_mutex(); if (data & WORK_STRUCT_PWQ) return work_struct_pwq(data)->pool; pool_id = data >> WORK_OFFQ_POOL_SHIFT; if (pool_id == WORK_OFFQ_POOL_NONE) return NULL; return idr_find(&worker_pool_idr, pool_id); } /** * get_work_pool_id - return the worker pool ID a given work is associated with * @work: the work item of interest * * Return: The worker_pool ID @work was last associated with. * %WORK_OFFQ_POOL_NONE if none. */ static int get_work_pool_id(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) return work_struct_pwq(data)->pool->id; return data >> WORK_OFFQ_POOL_SHIFT; } static void mark_work_canceling(struct work_struct *work) { unsigned long pool_id = get_work_pool_id(work); pool_id <<= WORK_OFFQ_POOL_SHIFT; set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING); } static bool work_is_canceling(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING); } /* * Policy functions. These define the policies on how the global worker * pools are managed. Unless noted otherwise, these functions assume that * they're being called with pool->lock held. */ /* * Need to wake up a worker? Called from anything but currently * running workers. * * Note that, because unbound workers never contribute to nr_running, this * function will always return %true for unbound pools as long as the * worklist isn't empty. */ static bool need_more_worker(struct worker_pool *pool) { return !list_empty(&pool->worklist) && !pool->nr_running; } /* Can I start working? Called from busy but !running workers. */ static bool may_start_working(struct worker_pool *pool) { return pool->nr_idle; } /* Do I need to keep working? Called from currently running workers. */ static bool keep_working(struct worker_pool *pool) { return !list_empty(&pool->worklist) && (pool->nr_running <= 1); } /* Do we need a new worker? Called from manager. */ static bool need_to_create_worker(struct worker_pool *pool) { return need_more_worker(pool) && !may_start_working(pool); } /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { bool managing = pool->flags & POOL_MANAGER_ACTIVE; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } /** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set * * Set @flags in @worker->flags and adjust nr_running accordingly. */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; lockdep_assert_held(&pool->lock); /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { pool->nr_running--; } worker->flags |= flags; } /** * worker_clr_flags - clear worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to clear * * Clear @flags in @worker->flags and adjust nr_running accordingly. */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; lockdep_assert_held(&pool->lock); worker->flags &= ~flags; /* * If transitioning out of NOT_RUNNING, increment nr_running. Note * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask * of multiple flags, not a single flag. */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) pool->nr_running++; } /* Return the first idle worker. Called with pool->lock held. */ static struct worker *first_idle_worker(struct worker_pool *pool) { if (unlikely(list_empty(&pool->idle_list))) return NULL; return list_first_entry(&pool->idle_list, struct worker, entry); } /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state * * @worker is entering idle state. Update stats and idle timer if * necessary. * * LOCKING: * raw_spin_lock_irq(pool->lock). */ static void worker_enter_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || WARN_ON_ONCE(!list_empty(&worker->entry) && (worker->hentry.next || worker->hentry.pprev))) return; /* can't use worker_set_flags(), also called from create_worker() */ worker->flags |= WORKER_IDLE; pool->nr_idle++; worker->last_active = jiffies; /* idle_list is LIFO */ list_add(&worker->entry, &pool->idle_list); if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* Sanity check nr_running. */ WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); } /** * worker_leave_idle - leave idle state * @worker: worker which is leaving idle state * * @worker is leaving idle state. Update stats. * * LOCKING: * raw_spin_lock_irq(pool->lock). */ static void worker_leave_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) return; worker_clr_flags(worker, WORKER_IDLE); pool->nr_idle--; list_del_init(&worker->entry); } /** * find_worker_executing_work - find worker which is executing a work * @pool: pool of interest * @work: work to find worker for * * Find a worker which is executing @work on @pool by searching * @pool->busy_hash which is keyed by the address of @work. For a worker * to match, its current execution should match the address of @work and * its work function. This is to avoid unwanted dependency between * unrelated work executions through a work item being recycled while still * being executed. * * This is a bit tricky. A work item may be freed once its execution * starts and nothing prevents the freed area from being recycled for * another work item. If the same work item address ends up being reused * before the original execution finishes, workqueue will identify the * recycled work item as currently executing and make it wait until the * current execution finishes, introducing an unwanted dependency. * * This function checks the work item address and work function to avoid * false positives. Note that this isn't complete as one may construct a * work function which can introduce dependency onto itself through a * recycled work item. Well, if somebody wants to shoot oneself in the * foot that badly, there's only so much we can do, and if such deadlock * actually occurs, it should be easy to locate the culprit work function. * * CONTEXT: * raw_spin_lock_irq(pool->lock). * * Return: * Pointer to worker which is executing @work if found, %NULL * otherwise. */ static struct worker *find_worker_executing_work(struct worker_pool *pool, struct work_struct *work) { struct worker *worker; hash_for_each_possible(pool->busy_hash, worker, hentry, (unsigned long)work) if (worker->current_work == work && worker->current_func == work->func) return worker; return NULL; } /** * move_linked_works - move linked works to a list * @work: start of series of works to be scheduled * @head: target list to append @work to * @nextp: out parameter for nested worklist walking * * Schedule linked works starting from @work to @head. Work series to be * scheduled starts at @work and includes any consecutive work with * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on * @nextp. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void move_linked_works(struct work_struct *work, struct list_head *head, struct work_struct **nextp) { struct work_struct *n; /* * Linked worklist will always end before the end of the list, * use NULL for list head. */ list_for_each_entry_safe_from(work, n, NULL, entry) { list_move_tail(&work->entry, head); if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) break; } /* * If we're already inside safe list traversal and have moved * multiple works to the scheduled queue, the next position * needs to be updated. */ if (nextp) *nextp = n; } /** * assign_work - assign a work item and its linked work items to a worker * @work: work to assign * @worker: worker to assign to * @nextp: out parameter for nested worklist walking * * Assign @work and its linked work items to @worker. If @work is already being * executed by another worker in the same pool, it'll be punted there. * * If @nextp is not NULL, it's updated to point to the next work of the last * scheduled work. This allows assign_work() to be nested inside * list_for_each_entry_safe(). * * Returns %true if @work was successfully assigned to @worker. %false if @work * was punted to another worker already executing it. */ static bool assign_work(struct work_struct *work, struct worker *worker, struct work_struct **nextp) { struct worker_pool *pool = worker->pool; struct worker *collision; lockdep_assert_held(&pool->lock); /* * A single work shouldn't be executed concurrently by multiple workers. * __queue_work() ensures that @work doesn't jump to a different pool * while still running in the previous pool. Here, we should ensure that * @work is not executed concurrently by multiple workers from the same * pool. Check whether anyone is already processing the work. If so, * defer the work to the currently executing one. */ collision = find_worker_executing_work(pool, work); if (unlikely(collision)) { move_linked_works(work, &collision->scheduled, nextp); return false; } move_linked_works(work, &worker->scheduled, nextp); return true; } /** * kick_pool - wake up an idle worker if necessary * @pool: pool to kick * * @pool may have pending work items. Wake up worker if necessary. Returns * whether a worker was woken up. */ static bool kick_pool(struct worker_pool *pool) { struct worker *worker = first_idle_worker(pool); struct task_struct *p; lockdep_assert_held(&pool->lock); if (!need_more_worker(pool) || !worker) return false; p = worker->task; #ifdef CONFIG_SMP /* * Idle @worker is about to execute @work and waking up provides an * opportunity to migrate @worker at a lower cost by setting the task's * wake_cpu field. Let's see if we want to move @worker to improve * execution locality. * * We're waking the worker that went idle the latest and there's some * chance that @worker is marked idle but hasn't gone off CPU yet. If * so, setting the wake_cpu won't do anything. As this is a best-effort * optimization and the race window is narrow, let's leave as-is for * now. If this becomes pronounced, we can skip over workers which are * still on cpu when picking an idle worker. * * If @pool has non-strict affinity, @worker might have ended up outside * its affinity scope. Repatriate. */ if (!pool->attrs->affn_strict && !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) { struct work_struct *work = list_first_entry(&pool->worklist, struct work_struct, entry); p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask); get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++; } #endif wake_up_process(p); return true; } #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT /* * Concurrency-managed per-cpu work items that hog CPU for longer than * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism, * which prevents them from stalling other concurrency-managed work items. If a * work function keeps triggering this mechanism, it's likely that the work item * should be using an unbound workqueue instead. * * wq_cpu_intensive_report() tracks work functions which trigger such conditions * and report them so that they can be examined and converted to use unbound * workqueues as appropriate. To avoid flooding the console, each violating work * function is tracked and reported with exponential backoff. */ #define WCI_MAX_ENTS 128 struct wci_ent { work_func_t func; atomic64_t cnt; struct hlist_node hash_node; }; static struct wci_ent wci_ents[WCI_MAX_ENTS]; static int wci_nr_ents; static DEFINE_RAW_SPINLOCK(wci_lock); static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS)); static struct wci_ent *wci_find_ent(work_func_t func) { struct wci_ent *ent; hash_for_each_possible_rcu(wci_hash, ent, hash_node, (unsigned long)func) { if (ent->func == func) return ent; } return NULL; } static void wq_cpu_intensive_report(work_func_t func) { struct wci_ent *ent; restart: ent = wci_find_ent(func); if (ent) { u64 cnt; /* * Start reporting from the fourth time and back off * exponentially. */ cnt = atomic64_inc_return_relaxed(&ent->cnt); if (cnt >= 4 && is_power_of_2(cnt)) printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n", ent->func, wq_cpu_intensive_thresh_us, atomic64_read(&ent->cnt)); return; } /* * @func is a new violation. Allocate a new entry for it. If wcn_ents[] * is exhausted, something went really wrong and we probably made enough * noise already. */ if (wci_nr_ents >= WCI_MAX_ENTS) return; raw_spin_lock(&wci_lock); if (wci_nr_ents >= WCI_MAX_ENTS) { raw_spin_unlock(&wci_lock); return; } if (wci_find_ent(func)) { raw_spin_unlock(&wci_lock); goto restart; } ent = &wci_ents[wci_nr_ents++]; ent->func = func; atomic64_set(&ent->cnt, 1); hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func); raw_spin_unlock(&wci_lock); } #else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ static void wq_cpu_intensive_report(work_func_t func) {} #endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ /** * wq_worker_running - a worker is running again * @task: task waking up * * This function is called when a worker returns from schedule() */ void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); if (!READ_ONCE(worker->sleeping)) return; /* * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check * and the nr_running increment below, we may ruin the nr_running reset * and leave with an unexpected pool->nr_running == 1 on the newly unbound * pool. Protect against such race. */ preempt_disable(); if (!(worker->flags & WORKER_NOT_RUNNING)) worker->pool->nr_running++; preempt_enable(); /* * CPU intensive auto-detection cares about how long a work item hogged * CPU without sleeping. Reset the starting timestamp on wakeup. */ worker->current_at = worker->task->se.sum_exec_runtime; WRITE_ONCE(worker->sleeping, 0); } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep * * This function is called from schedule() when a busy worker is * going to sleep. */ void wq_worker_sleeping(struct task_struct *task) { struct worker *worker = kthread_data(task); struct worker_pool *pool; /* * Rescuers, which may not have all the fields set up like normal * workers, also reach here, let's not access anything before * checking NOT_RUNNING. */ if (worker->flags & WORKER_NOT_RUNNING) return; pool = worker->pool; /* Return if preempted before wq_worker_running() was reached */ if (READ_ONCE(worker->sleeping)) return; WRITE_ONCE(worker->sleeping, 1); raw_spin_lock_irq(&pool->lock); /* * Recheck in case unbind_workers() preempted us. We don't * want to decrement nr_running after the worker is unbound * and nr_running has been reset. */ if (worker->flags & WORKER_NOT_RUNNING) { raw_spin_unlock_irq(&pool->lock); return; } pool->nr_running--; if (kick_pool(pool)) worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++; raw_spin_unlock_irq(&pool->lock); } /** * wq_worker_tick - a scheduler tick occurred while a kworker is running * @task: task currently running * * Called from scheduler_tick(). We're in the IRQ context and the current * worker's fields which follow the 'K' locking rule can be accessed safely. */ void wq_worker_tick(struct task_struct *task) { struct worker *worker = kthread_data(task); struct pool_workqueue *pwq = worker->current_pwq; struct worker_pool *pool = worker->pool; if (!pwq) return; pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC; if (!wq_cpu_intensive_thresh_us) return; /* * If the current worker is concurrency managed and hogged the CPU for * longer than wq_cpu_intensive_thresh_us, it's automatically marked * CPU_INTENSIVE to avoid stalling other concurrency-managed work items. * * Set @worker->sleeping means that @worker is in the process of * switching out voluntarily and won't be contributing to * @pool->nr_running until it wakes up. As wq_worker_sleeping() also * decrements ->nr_running, setting CPU_INTENSIVE here can lead to * double decrements. The task is releasing the CPU anyway. Let's skip. * We probably want to make this prettier in the future. */ if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) || worker->task->se.sum_exec_runtime - worker->current_at < wq_cpu_intensive_thresh_us * NSEC_PER_USEC) return; raw_spin_lock(&pool->lock); worker_set_flags(worker, WORKER_CPU_INTENSIVE); wq_cpu_intensive_report(worker->current_func); pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; if (kick_pool(pool)) pwq->stats[PWQ_STAT_CM_WAKEUP]++; raw_spin_unlock(&pool->lock); } /** * wq_worker_last_func - retrieve worker's last work function * @task: Task to retrieve last work function of. * * Determine the last function a worker executed. This is called from * the scheduler to get a worker's last known identity. * * CONTEXT: * raw_spin_lock_irq(rq->lock) * * This function is called during schedule() when a kworker is going * to sleep. It's used by psi to identify aggregation workers during * dequeuing, to allow periodic aggregation to shut-off when that * worker is the last task in the system or cgroup to go to sleep. * * As this function doesn't involve any workqueue-related locking, it * only returns stable values when called from inside the scheduler's * queuing and dequeuing paths, when @task, which must be a kworker, * is guaranteed to not be processing any works. * * Return: * The last work function %current executed as a worker, NULL if it * hasn't executed any work yet. */ work_func_t wq_worker_last_func(struct task_struct *task) { struct worker *worker = kthread_data(task); return worker->last_func; } /** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get * * Obtain an extra reference on @pwq. The caller should guarantee that * @pwq has positive refcnt and be holding the matching pool->lock. */ static void get_pwq(struct pool_workqueue *pwq) { lockdep_assert_held(&pwq->pool->lock); WARN_ON_ONCE(pwq->refcnt <= 0); pwq->refcnt++; } /** * put_pwq - put a pool_workqueue reference * @pwq: pool_workqueue to put * * Drop a reference of @pwq. If its refcnt reaches zero, schedule its * destruction. The caller should be holding the matching pool->lock. */ static void put_pwq(struct pool_workqueue *pwq) { lockdep_assert_held(&pwq->pool->lock); if (likely(--pwq->refcnt)) return; /* * @pwq can't be released under pool->lock, bounce to a dedicated * kthread_worker to avoid A-A deadlocks. */ kthread_queue_work(pwq_release_worker, &pwq->release_work); } /** * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock * @pwq: pool_workqueue to put (can be %NULL) * * put_pwq() with locking. This function also allows %NULL @pwq. */ static void put_pwq_unlocked(struct pool_workqueue *pwq) { if (pwq) { /* * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ raw_spin_lock_irq(&pwq->pool->lock); put_pwq(pwq); raw_spin_unlock_irq(&pwq->pool->lock); } } static void pwq_activate_inactive_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); trace_workqueue_activate_work(work); if (list_empty(&pwq->pool->worklist)) pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work)); pwq->nr_active++; } static void pwq_activate_first_inactive(struct pool_workqueue *pwq) { struct work_struct *work = list_first_entry(&pwq->inactive_works, struct work_struct, entry); pwq_activate_inactive_work(work); } /** * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight * @pwq: pwq of interest * @work_data: work_data of work which left the queue * * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its pwq and handle workqueue flushing. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data) { int color = get_work_color(work_data); if (!(work_data & WORK_STRUCT_INACTIVE)) { pwq->nr_active--; if (!list_empty(&pwq->inactive_works)) { /* one down, submit an inactive one */ if (pwq->nr_active < pwq->max_active) pwq_activate_first_inactive(pwq); } } pwq->nr_in_flight[color]--; /* is flush in progress and are we at the flushing tip? */ if (likely(pwq->flush_color != color)) goto out_put; /* are there still in-flight works? */ if (pwq->nr_in_flight[color]) goto out_put; /* this pwq is done, clear flush_color */ pwq->flush_color = -1; /* * If this was the last pwq, wake up the first flusher. It * will handle the rest. */ if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) complete(&pwq->wq->first_flusher->done); out_put: put_pwq(pwq); } /** * try_to_grab_pending - steal work item from worklist and disable irq * @work: work item to steal * @is_dwork: @work is a delayed_work * @flags: place to store irq state * * Try to grab PENDING bit of @work. This function can handle @work in any * stable state - idle, on timer or on worklist. * * Return: * * ======== ================================================================ * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry * -ENOENT if someone else is canceling @work, this state may persist * for arbitrarily long * ======== ================================================================ * * Note: * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting * interrupted while holding PENDING and @work off queue, irq must be * disabled on entry. This, combined with delayed_work->timer being * irqsafe, ensures that we return -EAGAIN for finite short period of time. * * On successful return, >= 0, irq is disabled and the caller is * responsible for releasing it using local_irq_restore(*@flags). * * This function is safe to call from any context including IRQ handler. */ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, unsigned long *flags) { struct worker_pool *pool; struct pool_workqueue *pwq; local_irq_save(*flags); /* try to steal the timer if it exists */ if (is_dwork) { struct delayed_work *dwork = to_delayed_work(work); /* * dwork->timer is irqsafe. If del_timer() fails, it's * guaranteed that the timer is not queued anywhere and not * running on the local CPU. */ if (likely(del_timer(&dwork->timer))) return 1; } /* try to claim PENDING the normal way */ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; rcu_read_lock(); /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. */ pool = get_work_pool(work); if (!pool) goto fail; raw_spin_lock(&pool->lock); /* * work->data is guaranteed to point to pwq only while the work * item is queued on pwq->wq, and both updating work->data to point * to pwq on queueing and to pool on dequeueing are done under * pwq->pool->lock. This in turn guarantees that, if work->data * points to pwq which is associated with a locked pool, the work * item is currently queued on that pool. */ pwq = get_work_pwq(work); if (pwq && pwq->pool == pool) { debug_work_deactivate(work); /* * A cancelable inactive work item must be in the * pwq->inactive_works since a queued barrier can't be * canceled (see the comments in insert_wq_barrier()). * * An inactive work item cannot be grabbed directly because * it might have linked barrier work items which, if left * on the inactive_works list, will confuse pwq->nr_active * management later on and cause stall. Make sure the work * item is activated before grabbing. */ if (*work_data_bits(work) & WORK_STRUCT_INACTIVE) pwq_activate_inactive_work(work); list_del_init(&work->entry); pwq_dec_nr_in_flight(pwq, *work_data_bits(work)); /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); raw_spin_unlock(&pool->lock); rcu_read_unlock(); return 1; } raw_spin_unlock(&pool->lock); fail: rcu_read_unlock(); local_irq_restore(*flags); if (work_is_canceling(work)) return -ENOENT; cpu_relax(); return -EAGAIN; } /** * insert_work - insert a work into a pool * @pwq: pwq @work belongs to * @work: work to insert * @head: insertion point * @extra_flags: extra WORK_STRUCT_* flags to set * * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to * work_struct flags. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { debug_work_activate(work); /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack_noalloc(work); /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); get_pwq(pwq); } /* * Test whether @work is being queued from another work executing on the * same workqueue. */ static bool is_chained_work(struct workqueue_struct *wq) { struct worker *worker; worker = current_wq_worker(); /* * Return %true iff I'm a worker executing a work item on @wq. If * I'm @worker, it's safe to dereference it without locking. */ return worker && worker->current_pwq->wq == wq; } /* * When queueing an unbound work item to a wq, prefer local CPU if allowed * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to * avoid perturbing sensitive tasks. */ static int wq_select_unbound_cpu(int cpu) { int new_cpu; if (likely(!wq_debug_force_rr_cpu)) { if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) return cpu; } else { pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n"); } if (cpumask_empty(wq_unbound_cpumask)) return cpu; new_cpu = __this_cpu_read(wq_rr_cpu_last); new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); if (unlikely(new_cpu >= nr_cpu_ids)) { new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask); if (unlikely(new_cpu >= nr_cpu_ids)) return cpu; } __this_cpu_write(wq_rr_cpu_last, new_cpu); return new_cpu; } static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; struct worker_pool *last_pool, *pool; unsigned int work_flags; unsigned int req_cpu = cpu; /* * While a work item is PENDING && off queue, a task trying to * steal the PENDING will busy-loop waiting for it to either get * queued or lose PENDING. Grabbing PENDING and queueing should * happen with IRQ disabled. */ lockdep_assert_irqs_disabled(); /* * For a draining wq, only works from the same workqueue are * allowed. The __WQ_DESTROYING helps to spot the issue that * queues a new work item to a wq after destroy_workqueue(wq). */ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq)))) return; rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ if (req_cpu == WORK_CPU_UNBOUND) { if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); else cpu = raw_smp_processor_id(); } pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu)); pool = pwq->pool; /* * If @work was previously on a different pool, it might still be * running there, in which case the work needs to be queued on that * pool to guarantee non-reentrancy. */ last_pool = get_work_pool(work); if (last_pool && last_pool != pool) { struct worker *worker; raw_spin_lock(&last_pool->lock); worker = find_worker_executing_work(last_pool, work); if (worker && worker->current_pwq->wq == wq) { pwq = worker->current_pwq; pool = pwq->pool; WARN_ON_ONCE(pool != last_pool); } else { /* meh... not running there, queue here */ raw_spin_unlock(&last_pool->lock); raw_spin_lock(&pool->lock); } } else { raw_spin_lock(&pool->lock); } /* * pwq is determined and locked. For unbound pools, we could have raced * with pwq release and it could already be dead. If its refcnt is zero, * repeat pwq selection. Note that unbound pwqs never die without * another pwq replacing it in cpu_pwq or while work items are executing * on it, so the retrying is guaranteed to make forward-progress. */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { raw_spin_unlock(&pool->lock); cpu_relax(); goto retry; } /* oops */ WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", wq->name, cpu); } /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); if (WARN_ON(!list_empty(&work->entry))) goto out; pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); if (likely(pwq->nr_active < pwq->max_active)) { if (list_empty(&pool->worklist)) pool->watchdog_ts = jiffies; trace_workqueue_activate_work(work); pwq->nr_active++; insert_work(pwq, work, &pool->worklist, work_flags); kick_pool(pool); } else { work_flags |= WORK_STRUCT_INACTIVE; insert_work(pwq, work, &pwq->inactive_works, work_flags); } out: raw_spin_unlock(&pool->lock); rcu_read_unlock(); } /** * queue_work_on - queue work on specific cpu * @cpu: CPU number to execute work on * @wq: workqueue to use * @work: work to queue * * We queue the work to a specific CPU, the caller must ensure it * can't go away. Callers that fail to ensure that the specified * CPU cannot go away will execute on a randomly chosen CPU. * But note well that callers specifying a CPU that never has been * online will get a splat. * * Return: %false if @work was already on a queue, %true otherwise. */ bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { bool ret = false; unsigned long flags; local_irq_save(flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_work(cpu, wq, work); ret = true; } local_irq_restore(flags); return ret; } EXPORT_SYMBOL(queue_work_on); /** * select_numa_node_cpu - Select a CPU based on NUMA node * @node: NUMA node ID that we want to select a CPU from * * This function will attempt to find a "random" cpu available on a given * node. If there are no CPUs available on the given node it will return * WORK_CPU_UNBOUND indicating that we should just schedule to any * available CPU if we need to schedule this work. */ static int select_numa_node_cpu(int node) { int cpu; /* Delay binding to CPU if node is not valid or online */ if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) return WORK_CPU_UNBOUND; /* Use local node/cpu if we are already there */ cpu = raw_smp_processor_id(); if (node == cpu_to_node(cpu)) return cpu; /* Use "random" otherwise know as "first" online CPU of node */ cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); /* If CPU is valid return that, otherwise just defer */ return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND; } /** * queue_work_node - queue work on a "random" cpu for a given NUMA node * @node: NUMA node that we are targeting the work for * @wq: workqueue to use * @work: work to queue * * We queue the work to a "random" CPU within a given NUMA node. The basic * idea here is to provide a way to somehow associate work with a given * NUMA node. * * This function will only make a best effort attempt at getting this onto * the right NUMA node. If no node is requested or the requested node is * offline then we just fall back to standard queue_work behavior. * * Currently the "random" CPU ends up being the first available CPU in the * intersection of cpu_online_mask and the cpumask of the node, unless we * are running on the node. In that case we just use the current CPU. * * Return: %false if @work was already on a queue, %true otherwise. */ bool queue_work_node(int node, struct workqueue_struct *wq, struct work_struct *work) { unsigned long flags; bool ret = false; /* * This current implementation is specific to unbound workqueues. * Specifically we only return the first available CPU for a given * node instead of cycling through individual CPUs within the node. * * If this is used with a per-cpu workqueue then the logic in * workqueue_select_cpu_near would need to be updated to allow for * some round robin type logic. */ WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); local_irq_save(flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { int cpu = select_numa_node_cpu(node); __queue_work(cpu, wq, work); ret = true; } local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(queue_work_node); void delayed_work_timer_fn(struct timer_list *t) { struct delayed_work *dwork = from_timer(dwork, t, timer); /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); } EXPORT_SYMBOL(delayed_work_timer_fn); static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); WARN_ON_ONCE(timer->function != delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { __queue_work(cpu, wq, &dwork->work); return; } dwork->wq = wq; dwork->cpu = cpu; timer->expires = jiffies + delay; if (unlikely(cpu != WORK_CPU_UNBOUND)) add_timer_on(timer, cpu); else add_timer(timer); } /** * queue_delayed_work_on - queue work on specific CPU after delay * @cpu: CPU number to execute work on * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * Return: %false if @work was already on a queue, %true otherwise. If * @delay is zero and @dwork is idle, it will be scheduled for immediate * execution. */ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { struct work_struct *work = &dwork->work; bool ret = false; unsigned long flags; /* read the comment in __queue_work() */ local_irq_save(flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } local_irq_restore(flags); return ret; } EXPORT_SYMBOL(queue_delayed_work_on); /** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU * @cpu: CPU number to execute work on * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is * zero, @work is guaranteed to be scheduled immediately regardless of its * current state. * * Return: %false if @dwork was idle and queued, %true if @dwork was * pending and its timer was modified. * * This function is safe to call from any context including IRQ handler. * See try_to_grab_pending() for details. */ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { unsigned long flags; int ret; do { ret = try_to_grab_pending(&dwork->work, true, &flags); } while (unlikely(ret == -EAGAIN)); if (likely(ret >= 0)) { __queue_delayed_work(cpu, wq, dwork, delay); local_irq_restore(flags); } /* -ENOENT from try_to_grab_pending() becomes %true */ return ret; } EXPORT_SYMBOL_GPL(mod_delayed_work_on); static void rcu_work_rcufn(struct rcu_head *rcu) { struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu); /* read the comment in __queue_work() */ local_irq_disable(); __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work); local_irq_enable(); } /** * queue_rcu_work - queue work after a RCU grace period * @wq: workqueue to use * @rwork: work to queue * * Return: %false if @rwork was already pending, %true otherwise. Note * that a full RCU grace period is guaranteed only after a %true return. * While @rwork is guaranteed to be executed after a %false return, the * execution may happen before a full RCU grace period has passed. */ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) { struct work_struct *work = &rwork->work; if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { rwork->wq = wq; call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); return true; } return false; } EXPORT_SYMBOL(queue_rcu_work); static struct worker *alloc_worker(int node) { struct worker *worker; worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); INIT_LIST_HEAD(&worker->node); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } return worker; } static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) { if (pool->cpu < 0 && pool->attrs->affn_strict) return pool->attrs->__pod_cpumask; else return pool->attrs->cpumask; } /** * worker_attach_to_pool() - attach a worker to a pool * @worker: worker to be attached * @pool: the target pool * * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and * cpu-binding of @worker are kept coordinated with the pool across * cpu-[un]hotplugs. */ static void worker_attach_to_pool(struct worker *worker, struct worker_pool *pool) { mutex_lock(&wq_pool_attach_mutex); /* * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains * stable across this function. See the comments above the flag * definition for details. */ if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; else kthread_set_per_cpu(worker->task, pool->cpu); if (worker->rescue_wq) set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)); list_add_tail(&worker->node, &pool->workers); worker->pool = pool; mutex_unlock(&wq_pool_attach_mutex); } /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool * * Undo the attaching which had been done in worker_attach_to_pool(). The * caller worker shouldn't access to the pool after detached except it has * other reference to the pool. */ static void worker_detach_from_pool(struct worker *worker) { struct worker_pool *pool = worker->pool; struct completion *detach_completion = NULL; mutex_lock(&wq_pool_attach_mutex); kthread_set_per_cpu(worker->task, -1); list_del(&worker->node); worker->pool = NULL; if (list_empty(&pool->workers) && list_empty(&pool->dying_workers)) detach_completion = pool->detach_completion; mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); if (detach_completion) complete(detach_completion); } /** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to * * Create and start a new worker which is attached to @pool. * * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. * * Return: * Pointer to the newly created worker. */ static struct worker *create_worker(struct worker_pool *pool) { struct worker *worker; int id; char id_buf[23]; /* ID is needed to determine kthread name */ id = ida_alloc(&pool->worker_ida, GFP_KERNEL); if (id < 0) { pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n", ERR_PTR(id)); return NULL; } worker = alloc_worker(pool->node); if (!worker) { pr_err_once("workqueue: Failed to allocate a worker\n"); goto fail; } worker->id = id; if (pool->cpu >= 0) snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, pool->attrs->nice < 0 ? "H" : ""); else snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); worker->task = kthread_create_on_node(worker_thread, worker, pool->node, "kworker/%s", id_buf); if (IS_ERR(worker->task)) { if (PTR_ERR(worker->task) == -EINTR) { pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n", id_buf); } else { pr_err_once("workqueue: Failed to create a worker thread: %pe", worker->task); } goto fail; } set_user_nice(worker->task, pool->attrs->nice); kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); /* start the newly created worker */ raw_spin_lock_irq(&pool->lock); worker->pool->nr_workers++; worker_enter_idle(worker); kick_pool(pool); /* * @worker is waiting on a completion in kthread() and will trigger hung * check if not woken up soon. As kick_pool() might not have waken it * up, wake it up explicitly once more. */ wake_up_process(worker->task); raw_spin_unlock_irq(&pool->lock); return worker; fail: ida_free(&pool->worker_ida, id); kfree(worker); return NULL; } static void unbind_worker(struct worker *worker) { lockdep_assert_held(&wq_pool_attach_mutex); kthread_set_per_cpu(worker->task, -1); if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); else WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); } static void wake_dying_workers(struct list_head *cull_list) { struct worker *worker, *tmp; list_for_each_entry_safe(worker, tmp, cull_list, entry) { list_del_init(&worker->entry); unbind_worker(worker); /* * If the worker was somehow already running, then it had to be * in pool->idle_list when set_worker_dying() happened or we * wouldn't have gotten here. * * Thus, the worker must either have observed the WORKER_DIE * flag, or have set its state to TASK_IDLE. Either way, the * below will be observed by the worker and is safe to do * outside of pool->lock. */ wake_up_process(worker->task); } } /** * set_worker_dying - Tag a worker for destruction * @worker: worker to be destroyed * @list: transfer worker away from its pool->idle_list and into list * * Tag @worker for destruction and adjust @pool stats accordingly. The worker * should be idle. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void set_worker_dying(struct worker *worker, struct list_head *list) { struct worker_pool *pool = worker->pool; lockdep_assert_held(&pool->lock); lockdep_assert_held(&wq_pool_attach_mutex); /* sanity check frenzy */ if (WARN_ON(worker->current_work) || WARN_ON(!list_empty(&worker->scheduled)) || WARN_ON(!(worker->flags & WORKER_IDLE))) return; pool->nr_workers--; pool->nr_idle--; worker->flags |= WORKER_DIE; list_move(&worker->entry, list); list_move(&worker->node, &pool->dying_workers); } /** * idle_worker_timeout - check if some idle workers can now be deleted. * @t: The pool's idle_timer that just expired * * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in * worker_leave_idle(), as a worker flicking between idle and active while its * pool is at the too_many_workers() tipping point would cause too much timer * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let * it expire and re-evaluate things from there. */ static void idle_worker_timeout(struct timer_list *t) { struct worker_pool *pool = from_timer(pool, t, idle_timer); bool do_cull = false; if (work_pending(&pool->idle_cull_work)) return; raw_spin_lock_irq(&pool->lock); if (too_many_workers(pool)) { struct worker *worker; unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; do_cull = !time_before(jiffies, expires); if (!do_cull) mod_timer(&pool->idle_timer, expires); } raw_spin_unlock_irq(&pool->lock); if (do_cull) queue_work(system_unbound_wq, &pool->idle_cull_work); } /** * idle_cull_fn - cull workers that have been idle for too long. * @work: the pool's work for handling these idle workers * * This goes through a pool's idle workers and gets rid of those that have been * idle for at least IDLE_WORKER_TIMEOUT seconds. * * We don't want to disturb isolated CPUs because of a pcpu kworker being * culled, so this also resets worker affinity. This requires a sleepable * context, hence the split between timer callback and work item. */ static void idle_cull_fn(struct work_struct *work) { struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); LIST_HEAD(cull_list); /* * Grabbing wq_pool_attach_mutex here ensures an already-running worker * cannot proceed beyong worker_detach_from_pool() in its self-destruct * path. This is required as a previously-preempted worker could run after * set_worker_dying() has happened but before wake_dying_workers() did. */ mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { mod_timer(&pool->idle_timer, expires); break; } set_worker_dying(worker, &cull_list); } raw_spin_unlock_irq(&pool->lock); wake_dying_workers(&cull_list); mutex_unlock(&wq_pool_attach_mutex); } static void send_mayday(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); struct workqueue_struct *wq = pwq->wq; lockdep_assert_held(&wq_mayday_lock); if (!wq->rescuer) return; /* mayday mayday mayday */ if (list_empty(&pwq->mayday_node)) { /* * If @pwq is for an unbound wq, its base ref may be put at * any time due to an attribute change. Pin @pwq until the * rescuer is done with it. */ get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); pwq->stats[PWQ_STAT_MAYDAY]++; } } static void pool_mayday_timeout(struct timer_list *t) { struct worker_pool *pool = from_timer(pool, t, mayday_timer); struct work_struct *work; raw_spin_lock_irq(&pool->lock); raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */ if (need_to_create_worker(pool)) { /* * We've been trying to create a new worker but * haven't been successful. We might be hitting an * allocation deadlock. Send distress signals to * rescuers. */ list_for_each_entry(work, &pool->worklist, entry) send_mayday(work); } raw_spin_unlock(&wq_mayday_lock); raw_spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } /** * maybe_create_worker - create a new worker if necessary * @pool: pool to create a new worker for * * Create a new worker for @pool if necessary. @pool is guaranteed to * have at least one idle worker on return from this function. If * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * * On return, need_to_create_worker() is guaranteed to be %false and * may_start_working() %true. * * LOCKING: * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. */ static void maybe_create_worker(struct worker_pool *pool) __releases(&pool->lock) __acquires(&pool->lock) { restart: raw_spin_unlock_irq(&pool->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { if (create_worker(pool) || !need_to_create_worker(pool)) break; schedule_timeout_interruptible(CREATE_COOLDOWN); if (!need_to_create_worker(pool)) break; } del_timer_sync(&pool->mayday_timer); raw_spin_lock_irq(&pool->lock); /* * This is necessary even after a new worker was just successfully * created as @pool->lock was dropped and the new worker might have * already become busy. */ if (need_to_create_worker(pool)) goto restart; } /** * manage_workers - manage worker pool * @worker: self * * Assume the manager role and manage the worker pool @worker belongs * to. At any given time, there can be only zero or one manager per * pool. The exclusion is handled automatically by this function. * * The caller can safely start processing works on false return. On * true return, it's guaranteed that need_to_create_worker() is false * and may_start_working() is true. * * CONTEXT: * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * * Return: * %false if the pool doesn't need management and the caller can safely * start processing works, %true if management function was performed and * the conditions that the caller verified before calling the function may * no longer be true. */ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; if (pool->flags & POOL_MANAGER_ACTIVE) return false; pool->flags |= POOL_MANAGER_ACTIVE; pool->manager = worker; maybe_create_worker(pool); pool->manager = NULL; pool->flags &= ~POOL_MANAGER_ACTIVE; rcuwait_wake_up(&manager_wait); return true; } /** * process_one_work - process single work * @worker: self * @work: work to process * * Process @work. This function contains all the logics necessary to * process a single work including synchronization against and * interaction with other workers on the same cpu, queueing and * flushing. As long as context requirement is met, any worker can * call this function to process a work. * * CONTEXT: * raw_spin_lock_irq(pool->lock) which is released and regrabbed. */ static void process_one_work(struct worker *worker, struct work_struct *work) __releases(&pool->lock) __acquires(&pool->lock) { struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; unsigned long work_data; #ifdef CONFIG_LOCKDEP /* * It is permissible to free the struct work_struct from * inside the function that is called from it, this we need to * take into account for lockdep too. To avoid bogus "held * lock freed" warnings as well as problems when looking into * work->lockdep_map, make a copy and use that here. */ struct lockdep_map lockdep_map; lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif /* ensure we're on the correct CPU */ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && raw_smp_processor_id() != pool->cpu); /* claim and dequeue */ debug_work_deactivate(work); hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; worker->current_at = worker->task->se.sum_exec_runtime; work_data = *work_data_bits(work); worker->current_color = get_work_color(work_data); /* * Record wq name for cmdline and debug reporting, may get * overridden through set_worker_desc(). */ strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN); list_del_init(&work->entry); /* * CPU intensive works don't participate in concurrency management. * They're the scheduler's responsibility. This takes @worker out * of concurrency management and the next code block will chain * execution of the pending work items. */ if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE)) worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* * Kick @pool if necessary. It's always noop for per-cpu worker pools * since nr_running would always be >= 1 at this point. This is used to * chain execution of the pending work items for WORKER_NOT_RUNNING * workers such as the UNBOUND and CPU_INTENSIVE ones. */ kick_pool(pool); /* * Record the last pool and clear PENDING which should be the last * update to @work. Also, do this inside @pool->lock so that * PENDING and queued state changes happen together while IRQ is * disabled. */ set_work_pool_and_clear_pending(work, pool->id); pwq->stats[PWQ_STAT_STARTED]++; raw_spin_unlock_irq(&pool->lock); lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); /* * Strictly speaking we should mark the invariant state without holding * any locks, that is, before these two lock_map_acquire()'s. * * However, that would result in: * * A(W1) * WFC(C) * A(W1) * C(C) * * Which would create W1->C->W1 dependencies, even though there is no * actual deadlock possible. There are two solutions, using a * read-recursive acquire on the work(queue) 'locks', but this will then * hit the lockdep limitation on recursive locks, or simply discard * these locks. * * AFAICT there is no possible deadlock scenario between the * flush_work() and complete() primitives (except for single-threaded * workqueues), so hiding them isn't a problem. */ lockdep_invariant_state(true); trace_workqueue_execute_start(work); worker->current_func(work); /* * While we must be careful to not use "work" after this, the trace * point will only record its address. */ trace_workqueue_execute_end(work, worker->current_func); pwq->stats[PWQ_STAT_COMPLETED]++; lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" " last function: %ps\n", current->comm, preempt_count(), task_pid_nr(current), worker->current_func); debug_show_held_locks(current); dump_stack(); } /* * The following prevents a kworker from hogging CPU on !PREEMPTION * kernels, where a requeueing work item waiting for something to * happen could deadlock with stop_machine as such work item could * indefinitely requeue itself while all other CPUs are trapped in * stop_machine. At the same time, report a quiescent RCU state so * the same condition doesn't freeze RCU. */ cond_resched(); raw_spin_lock_irq(&pool->lock); /* * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked * CPU intensive by wq_worker_tick() if @work hogged CPU longer than * wq_cpu_intensive_thresh_us. Clear it. */ worker_clr_flags(worker, WORKER_CPU_INTENSIVE); /* tag the worker for identification in schedule() */ worker->last_func = worker->current_func; /* we're done with it, release */ hash_del(&worker->hentry); worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; worker->current_color = INT_MAX; pwq_dec_nr_in_flight(pwq, work_data); } /** * process_scheduled_works - process scheduled works * @worker: self * * Process all scheduled works. Please note that the scheduled list * may change while processing a work, so this function repeatedly * fetches a work from the top and executes it. * * CONTEXT: * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. */ static void process_scheduled_works(struct worker *worker) { struct work_struct *work; bool first = true; while ((work = list_first_entry_or_null(&worker->scheduled, struct work_struct, entry))) { if (first) { worker->pool->watchdog_ts = jiffies; first = false; } process_one_work(worker, work); } } static void set_pf_worker(bool val) { mutex_lock(&wq_pool_attach_mutex); if (val) current->flags |= PF_WQ_WORKER; else current->flags &= ~PF_WQ_WORKER; mutex_unlock(&wq_pool_attach_mutex); } /** * worker_thread - the worker thread function * @__worker: self * * The worker thread function. All workers belong to a worker_pool - * either a per-cpu one or dynamic unbound one. These workers process all * work items regardless of their specific target workqueue. The only * exception is work items which belong to workqueues with a rescuer which * will be explained in rescuer_thread(). * * Return: 0 */ static int worker_thread(void *__worker) { struct worker *worker = __worker; struct worker_pool *pool = worker->pool; /* tell the scheduler that this is a workqueue worker */ set_pf_worker(true); woke_up: raw_spin_lock_irq(&pool->lock); /* am I supposed to die? */ if (unlikely(worker->flags & WORKER_DIE)) { raw_spin_unlock_irq(&pool->lock); set_pf_worker(false); set_task_comm(worker->task, "kworker/dying"); ida_free(&pool->worker_ida, worker->id); worker_detach_from_pool(worker); WARN_ON_ONCE(!list_empty(&worker->entry)); kfree(worker); return 0; } worker_leave_idle(worker); recheck: /* no more worker necessary? */ if (!need_more_worker(pool)) goto sleep; /* do we need to manage? */ if (unlikely(!may_start_working(pool)) && manage_workers(worker)) goto recheck; /* * ->scheduled list can only be filled while a worker is * preparing to process a work or actually processing it. * Make sure nobody diddled with it while I was sleeping. */ WARN_ON_ONCE(!list_empty(&worker->scheduled)); /* * Finish PREP stage. We're guaranteed to have at least one idle * worker or that someone else has already assumed the manager * role. This is where @worker starts participating in concurrency * management if applicable and concurrency management is restored * after being rebound. See rebind_workers() for details. */ worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); do { struct work_struct *work = list_first_entry(&pool->worklist, struct work_struct, entry); if (assign_work(work, worker, NULL)) process_scheduled_works(worker); } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP); sleep: /* * pool->lock is held and there's no work to process and no need to * manage, sleep. Workers are woken up only while holding * pool->lock or from local cpu, so setting the current state * before releasing pool->lock is enough to prevent losing any * event. */ worker_enter_idle(worker); __set_current_state(TASK_IDLE); raw_spin_unlock_irq(&pool->lock); schedule(); goto woke_up; } /** * rescuer_thread - the rescuer thread function * @__rescuer: self * * Workqueue rescuer thread function. There's one rescuer for each * workqueue which has WQ_MEM_RECLAIM set. * * Regular work processing on a pool may block trying to create a new * worker which uses GFP_KERNEL allocation which has slight chance of * developing into deadlock if some works currently on the same queue * need to be processed to satisfy the GFP_KERNEL allocation. This is * the problem rescuer solves. * * When such condition is possible, the pool summons rescuers of all * workqueues which have works queued on the pool and let them process * those works so that forward progress can be guaranteed. * * This should happen rarely. * * Return: 0 */ static int rescuer_thread(void *__rescuer) { struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; bool should_stop; set_user_nice(current, RESCUER_NICE_LEVEL); /* * Mark rescuer as worker too. As WORKER_PREP is never cleared, it * doesn't participate in concurrency management. */ set_pf_worker(true); repeat: set_current_state(TASK_IDLE); /* * By the time the rescuer is requested to stop, the workqueue * shouldn't have any work pending, but @wq->maydays may still have * pwq(s) queued. This can happen by non-rescuer workers consuming * all the work items before the rescuer got to them. Go through * @wq->maydays processing before acting on should_stop so that the * list is always empty on exit. */ should_stop = kthread_should_stop(); /* see whether any pwq is asking for help */ raw_spin_lock_irq(&wq_mayday_lock); while (!list_empty(&wq->maydays)) { struct pool_workqueue *pwq = list_first_entry(&wq->maydays, struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); raw_spin_unlock_irq(&wq_mayday_lock); worker_attach_to_pool(rescuer, pool); raw_spin_lock_irq(&pool->lock); /* * Slurp in all works issued via this workqueue and * process'em. */ WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) { if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) pwq->stats[PWQ_STAT_RESCUED]++; } if (!list_empty(&rescuer->scheduled)) { process_scheduled_works(rescuer); /* * The above execution of rescued work items could * have created more to rescue through * pwq_activate_first_inactive() or chained * queueing. Let's put @pwq back on mayday list so * that such back-to-back work items, which may be * being used to relieve memory pressure, don't * incur MAYDAY_INTERVAL delay inbetween. */ if (pwq->nr_active && need_to_create_worker(pool)) { raw_spin_lock(&wq_mayday_lock); /* * Queue iff we aren't racing destruction * and somebody else hasn't queued it already. */ if (wq->rescuer && list_empty(&pwq->mayday_node)) { get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); } raw_spin_unlock(&wq_mayday_lock); } } /* * Put the reference grabbed by send_mayday(). @pool won't * go away while we're still attached to it. */ put_pwq(pwq); /* * Leave this pool. Notify regular workers; otherwise, we end up * with 0 concurrency and stalling the execution. */ kick_pool(pool); raw_spin_unlock_irq(&pool->lock); worker_detach_from_pool(rescuer); raw_spin_lock_irq(&wq_mayday_lock); } raw_spin_unlock_irq(&wq_mayday_lock); if (should_stop) { __set_current_state(TASK_RUNNING); set_pf_worker(false); return 0; } /* rescuers should never participate in concurrency management */ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); schedule(); goto repeat; } /** * check_flush_dependency - check for flush dependency sanity * @target_wq: workqueue being flushed * @target_work: work item being flushed (NULL for workqueue flushes) * * %current is trying to flush the whole @target_wq or @target_work on it. * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not * reclaiming memory or running on a workqueue which doesn't have * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to * a deadlock. */ static void check_flush_dependency(struct workqueue_struct *target_wq, struct work_struct *target_work) { work_func_t target_func = target_work ? target_work->func : NULL; struct worker *worker; if (target_wq->flags & WQ_MEM_RECLAIM) return; worker = current_wq_worker(); WARN_ONCE(current->flags & PF_MEMALLOC, "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", current->pid, current->comm, target_wq->name, target_func); WARN_ONCE(worker && ((worker->current_pwq->wq->flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); } struct wq_barrier { struct work_struct work; struct completion done; struct task_struct *task; /* purely informational */ }; static void wq_barrier_func(struct work_struct *work) { struct wq_barrier *barr = container_of(work, struct wq_barrier, work); complete(&barr->done); } /** * insert_wq_barrier - insert a barrier work * @pwq: pwq to insert barrier into * @barr: wq_barrier to insert * @target: target work to attach @barr to * @worker: worker currently executing @target, NULL if @target is not executing * * @barr is linked to @target such that @barr is completed only after * @target finishes execution. Please note that the ordering * guarantee is observed only with respect to @target and on the local * cpu. * * Currently, a queued barrier can't be canceled. This is because * try_to_grab_pending() can't determine whether the work to be * grabbed is at the head of the queue and thus can't clear LINKED * flag of the previous work while there must be a valid next work * after a work with LINKED flag set. * * Note that when @worker is non-NULL, @target may be modified * underneath us, so we can't reliably determine pwq from @target. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { unsigned int work_flags = 0; unsigned int work_color; struct list_head *head; /* * debugobject calls are safe here even with pool->lock locked * as we know for sure that this will not trigger any of the * checks and call back into the fixup functions where we * might deadlock. */ INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion_map(&barr->done, &target->lockdep_map); barr->task = current; /* The barrier work item does not participate in pwq->nr_active. */ work_flags |= WORK_STRUCT_INACTIVE; /* * If @target is currently being executed, schedule the * barrier to the worker; otherwise, put it after @target. */ if (worker) { head = worker->scheduled.next; work_color = worker->current_color; } else { unsigned long *bits = work_data_bits(target); head = target->entry.next; /* there can already be other linked works, inherit and set */ work_flags |= *bits & WORK_STRUCT_LINKED; work_color = get_work_color(*bits); __set_bit(WORK_STRUCT_LINKED_BIT, bits); } pwq->nr_in_flight[work_color]++; work_flags |= work_color_to_flags(work_color); insert_work(pwq, &barr->work, head, work_flags); } /** * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing * @wq: workqueue being flushed * @flush_color: new flush color, < 0 for no-op * @work_color: new work color, < 0 for no-op * * Prepare pwqs for workqueue flushing. * * If @flush_color is non-negative, flush_color on all pwqs should be * -1. If no pwq has in-flight commands at the specified color, all * pwq->flush_color's stay at -1 and %false is returned. If any pwq * has in flight commands, its pwq->flush_color is set to * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq * wakeup logic is armed and %true is returned. * * The caller should have initialized @wq->first_flusher prior to * calling this function with non-negative @flush_color. If * @flush_color is negative, no flush color update is done and %false * is returned. * * If @work_color is non-negative, all pwqs should have the same * work_color which is previous to @work_color and all will be * advanced to @work_color. * * CONTEXT: * mutex_lock(wq->mutex). * * Return: * %true if @flush_color >= 0 and there's something to flush. %false * otherwise. */ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, int flush_color, int work_color) { bool wait = false; struct pool_workqueue *pwq; if (flush_color >= 0) { WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); atomic_set(&wq->nr_pwqs_to_flush, 1); } for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; raw_spin_lock_irq(&pool->lock); if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); if (pwq->nr_in_flight[flush_color]) { pwq->flush_color = flush_color; atomic_inc(&wq->nr_pwqs_to_flush); wait = true; } } if (work_color >= 0) { WARN_ON_ONCE(work_color != work_next_color(pwq->work_color)); pwq->work_color = work_color; } raw_spin_unlock_irq(&pool->lock); } if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) complete(&wq->first_flusher->done); return wait; } /** * __flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * * This function sleeps until all work items which were queued on entry * have finished execution, but it is not livelocked by new incoming ones. */ void __flush_workqueue(struct workqueue_struct *wq) { struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), .flush_color = -1, .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map), }; int next_color; if (WARN_ON(!wq_online)) return; lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); mutex_lock(&wq->mutex); /* * Start-to-wait phase */ next_color = work_next_color(wq->work_color); if (next_color != wq->flush_color) { /* * Color space is not full. The current work_color * becomes our flush_color and work_color is advanced * by one. */ WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); this_flusher.flush_color = wq->work_color; wq->work_color = next_color; if (!wq->first_flusher) { /* no flush in progress, become the first flusher */ WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); wq->first_flusher = &this_flusher; if (!flush_workqueue_prep_pwqs(wq, wq->flush_color, wq->work_color)) { /* nothing to flush, done */ wq->flush_color = next_color; wq->first_flusher = NULL; goto out_unlock; } } else { /* wait in queue */ WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); list_add_tail(&this_flusher.list, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } } else { /* * Oops, color space is full, wait on overflow queue. * The next flush completion will assign us * flush_color and transfer to flusher_queue. */ list_add_tail(&this_flusher.list, &wq->flusher_overflow); } check_flush_dependency(wq, NULL); mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); /* * Wake-up-and-cascade phase * * First flushers are responsible for cascading flushes and * handling overflow. Non-first flushers can simply return. */ if (READ_ONCE(wq->first_flusher) != &this_flusher) return; mutex_lock(&wq->mutex); /* we might have raced, check again with mutex held */ if (wq->first_flusher != &this_flusher) goto out_unlock; WRITE_ONCE(wq->first_flusher, NULL); WARN_ON_ONCE(!list_empty(&this_flusher.list)); WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); while (true) { struct wq_flusher *next, *tmp; /* complete all the flushers sharing the current flush color */ list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { if (next->flush_color != wq->flush_color) break; list_del_init(&next->list); complete(&next->done); } WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && wq->flush_color != work_next_color(wq->work_color)); /* this flush_color is finished, advance by one */ wq->flush_color = work_next_color(wq->flush_color); /* one color has been freed, handle overflow queue */ if (!list_empty(&wq->flusher_overflow)) { /* * Assign the same color to all overflowed * flushers, advance work_color and append to * flusher_queue. This is the start-to-wait * phase for these overflowed flushers. */ list_for_each_entry(tmp, &wq->flusher_overflow, list) tmp->flush_color = wq->work_color; wq->work_color = work_next_color(wq->work_color); list_splice_tail_init(&wq->flusher_overflow, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } if (list_empty(&wq->flusher_queue)) { WARN_ON_ONCE(wq->flush_color != wq->work_color); break; } /* * Need to flush more colors. Make the next flusher * the new first flusher and arm pwqs. */ WARN_ON_ONCE(wq->flush_color == wq->work_color); WARN_ON_ONCE(wq->flush_color != next->flush_color); list_del_init(&next->list); wq->first_flusher = next; if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1)) break; /* * Meh... this color is already done, clear first * flusher and repeat cascading. */ wq->first_flusher = NULL; } out_unlock: mutex_unlock(&wq->mutex); } EXPORT_SYMBOL(__flush_workqueue); /** * drain_workqueue - drain a workqueue * @wq: workqueue to drain * * Wait until the workqueue becomes empty. While draining is in progress, * only chain queueing is allowed. IOW, only currently pending or running * work items on @wq can queue further work items on it. @wq is flushed * repeatedly until it becomes empty. The number of flushing is determined * by the depth of chaining and should be relatively short. Whine if it * takes too long. */ void drain_workqueue(struct workqueue_struct *wq) { unsigned int flush_cnt = 0; struct pool_workqueue *pwq; /* * __queue_work() needs to test whether there are drainers, is much * hotter than drain_workqueue() and already looks at @wq->flags. * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ mutex_lock(&wq->mutex); if (!wq->nr_drainers++) wq->flags |= __WQ_DRAINING; mutex_unlock(&wq->mutex); reflush: __flush_workqueue(wq); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { bool drained; raw_spin_lock_irq(&pwq->pool->lock); drained = !pwq->nr_active && list_empty(&pwq->inactive_works); raw_spin_unlock_irq(&pwq->pool->lock); if (drained) continue; if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) pr_warn("workqueue %s: %s() isn't complete after %u tries\n", wq->name, __func__, flush_cnt); mutex_unlock(&wq->mutex); goto reflush; } if (!--wq->nr_drainers) wq->flags &= ~__WQ_DRAINING; mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(drain_workqueue); static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, bool from_cancel) { struct worker *worker = NULL; struct worker_pool *pool; struct pool_workqueue *pwq; might_sleep(); rcu_read_lock(); pool = get_work_pool(work); if (!pool) { rcu_read_unlock(); return false; } raw_spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { if (unlikely(pwq->pool != pool)) goto already_gone; } else { worker = find_worker_executing_work(pool, work); if (!worker) goto already_gone; pwq = worker->current_pwq; } check_flush_dependency(pwq->wq, work); insert_wq_barrier(pwq, barr, work, worker); raw_spin_unlock_irq(&pool->lock); /* * Force a lock recursion deadlock when using flush_work() inside a * single-threaded or rescuer equipped workqueue. * * For single threaded workqueues the deadlock happens when the work * is after the work issuing the flush_work(). For rescuer equipped * workqueues the deadlock happens when the rescuer stalls, blocking * forward progress. */ if (!from_cancel && (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) { lock_map_acquire(&pwq->wq->lockdep_map); lock_map_release(&pwq->wq->lockdep_map); } rcu_read_unlock(); return true; already_gone: raw_spin_unlock_irq(&pool->lock); rcu_read_unlock(); return false; } static bool __flush_work(struct work_struct *work, bool from_cancel) { struct wq_barrier barr; if (WARN_ON(!wq_online)) return false; if (WARN_ON(!work->func)) return false; lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); if (start_flush_work(work, &barr, from_cancel)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); return true; } else { return false; } } /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush * * Wait until @work has finished execution. @work is guaranteed to be idle * on return if it hasn't been requeued since flush started. * * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. */ bool flush_work(struct work_struct *work) { return __flush_work(work, false); } EXPORT_SYMBOL_GPL(flush_work); struct cwt_wait { wait_queue_entry_t wait; struct work_struct *work; }; static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); if (cwait->work != key) return 0; return autoremove_wake_function(wait, mode, sync, key); } static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) { static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq); unsigned long flags; int ret; do { ret = try_to_grab_pending(work, is_dwork, &flags); /* * If someone else is already canceling, wait for it to * finish. flush_work() doesn't work for PREEMPT_NONE * because we may get scheduled between @work's completion * and the other canceling task resuming and clearing * CANCELING - flush_work() will return false immediately * as @work is no longer busy, try_to_grab_pending() will * return -ENOENT as @work is still being canceled and the * other canceling task won't be able to clear CANCELING as * we're hogging the CPU. * * Let's wait for completion using a waitqueue. As this * may lead to the thundering herd problem, use a custom * wake function which matches @work along with exclusive * wait and wakeup. */ if (unlikely(ret == -ENOENT)) { struct cwt_wait cwait; init_wait(&cwait.wait); cwait.wait.func = cwt_wakefn; cwait.work = work; prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait, TASK_UNINTERRUPTIBLE); if (work_is_canceling(work)) schedule(); finish_wait(&cancel_waitq, &cwait.wait); } } while (unlikely(ret < 0)); /* tell other tasks trying to grab @work to back off */ mark_work_canceling(work); local_irq_restore(flags); /* * This allows canceling during early boot. We know that @work * isn't executing. */ if (wq_online) __flush_work(work, true); clear_work_data(work); /* * Paired with prepare_to_wait() above so that either * waitqueue_active() is visible here or !work_is_canceling() is * visible there. */ smp_mb(); if (waitqueue_active(&cancel_waitq)) __wake_up(&cancel_waitq, TASK_NORMAL, 1, work); return ret; } /** * cancel_work_sync - cancel a work and wait for it to finish * @work: the work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself or migrates to * another workqueue. On return from this function, @work is * guaranteed to be not pending or executing on any CPU. * * cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use cancel_delayed_work_sync() instead. * * The caller must ensure that the workqueue on which @work was last * queued can't be destroyed before this function returns. * * Return: * %true if @work was pending, %false otherwise. */ bool cancel_work_sync(struct work_struct *work) { return __cancel_work_timer(work, false); } EXPORT_SYMBOL_GPL(cancel_work_sync); /** * flush_delayed_work - wait for a dwork to finish executing the last queueing * @dwork: the delayed work to flush * * Delayed timer is cancelled and the pending work is queued for * immediate execution. Like flush_work(), this function only * considers the last queueing instance of @dwork. * * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. */ bool flush_delayed_work(struct delayed_work *dwork) { local_irq_disable(); if (del_timer_sync(&dwork->timer)) __queue_work(dwork->cpu, dwork->wq, &dwork->work); local_irq_enable(); return flush_work(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work); /** * flush_rcu_work - wait for a rwork to finish executing the last queueing * @rwork: the rcu work to flush * * Return: * %true if flush_rcu_work() waited for the work to finish execution, * %false if it was already idle. */ bool flush_rcu_work(struct rcu_work *rwork) { if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) { rcu_barrier(); flush_work(&rwork->work); return true; } else { return flush_work(&rwork->work); } } EXPORT_SYMBOL(flush_rcu_work); static bool __cancel_work(struct work_struct *work, bool is_dwork) { unsigned long flags; int ret; do { ret = try_to_grab_pending(work, is_dwork, &flags); } while (unlikely(ret == -EAGAIN)); if (unlikely(ret < 0)) return false; set_work_pool_and_clear_pending(work, get_work_pool_id(work)); local_irq_restore(flags); return ret; } /* * See cancel_delayed_work() */ bool cancel_work(struct work_struct *work) { return __cancel_work(work, false); } EXPORT_SYMBOL(cancel_work); /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel * * Kill off a pending delayed_work. * * Return: %true if @dwork was pending and canceled; %false if it wasn't * pending. * * Note: * The work callback function may still be running on return, unless * it returns %true and the work doesn't re-arm itself. Explicitly flush or * use cancel_delayed_work_sync() to wait on it. * * This function is safe to call from any context including IRQ handler. */ bool cancel_delayed_work(struct delayed_work *dwork) { return __cancel_work(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work); /** * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish * @dwork: the delayed work cancel * * This is cancel_work_sync() for delayed works. * * Return: * %true if @dwork was pending, %false otherwise. */ bool cancel_delayed_work_sync(struct delayed_work *dwork) { return __cancel_work_timer(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work_sync); /** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call * * schedule_on_each_cpu() executes @func on each online CPU using the * system workqueue and blocks until all CPUs have completed. * schedule_on_each_cpu() is very slow. * * Return: * 0 on success, -errno on failure. */ int schedule_on_each_cpu(work_func_t func) { int cpu; struct work_struct __percpu *works; works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; cpus_read_lock(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); cpus_read_unlock(); free_percpu(works); return 0; } /** * execute_in_process_context - reliably execute the routine with user context * @fn: the function to execute * @ew: guaranteed storage for the execute work structure (must * be available when the work executes) * * Executes the function immediately if process context is available, * otherwise schedules the function for delayed execution. * * Return: 0 - function was executed * 1 - function was scheduled for execution */ int execute_in_process_context(work_func_t fn, struct execute_work *ew) { if (!in_interrupt()) { fn(&ew->work); return 0; } INIT_WORK(&ew->work, fn); schedule_work(&ew->work); return 1; } EXPORT_SYMBOL_GPL(execute_in_process_context); /** * free_workqueue_attrs - free a workqueue_attrs * @attrs: workqueue_attrs to free * * Undo alloc_workqueue_attrs(). */ void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); free_cpumask_var(attrs->__pod_cpumask); kfree(attrs); } } /** * alloc_workqueue_attrs - allocate a workqueue_attrs * * Allocate a new workqueue_attrs, initialize with default settings and * return it. * * Return: The allocated new workqueue_attr on success. %NULL on failure. */ struct workqueue_attrs *alloc_workqueue_attrs(void) { struct workqueue_attrs *attrs; attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (!attrs) goto fail; if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) goto fail; if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL)) goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); attrs->affn_scope = WQ_AFFN_DFL; return attrs; fail: free_workqueue_attrs(attrs); return NULL; } static void copy_workqueue_attrs(struct workqueue_attrs *to, const struct workqueue_attrs *from) { to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); cpumask_copy(to->__pod_cpumask, from->__pod_cpumask); to->affn_strict = from->affn_strict; /* * Unlike hash and equality test, copying shouldn't ignore wq-only * fields as copying is used for both pool and wq attrs. Instead, * get_unbound_pool() explicitly clears the fields. */ to->affn_scope = from->affn_scope; to->ordered = from->ordered; } /* * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the * comments in 'struct workqueue_attrs' definition. */ static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) { attrs->affn_scope = WQ_AFFN_NR_TYPES; attrs->ordered = false; } /* hash value of the content of @attr */ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) { u32 hash = 0; hash = jhash_1word(attrs->nice, hash); hash = jhash(cpumask_bits(attrs->cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); hash = jhash(cpumask_bits(attrs->__pod_cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); hash = jhash_1word(attrs->affn_strict, hash); return hash; } /* content equality test */ static bool wqattrs_equal(const struct workqueue_attrs *a, const struct workqueue_attrs *b) { if (a->nice != b->nice) return false; if (!cpumask_equal(a->cpumask, b->cpumask)) return false; if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) return false; if (a->affn_strict != b->affn_strict) return false; return true; } /* Update @attrs with actually available CPUs */ static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs, const cpumask_t *unbound_cpumask) { /* * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to * @unbound_cpumask. */ cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask); if (unlikely(cpumask_empty(attrs->cpumask))) cpumask_copy(attrs->cpumask, unbound_cpumask); } /* find wq_pod_type to use for @attrs */ static const struct wq_pod_type * wqattrs_pod_type(const struct workqueue_attrs *attrs) { enum wq_affn_scope scope; struct wq_pod_type *pt; /* to synchronize access to wq_affn_dfl */ lockdep_assert_held(&wq_pool_mutex); if (attrs->affn_scope == WQ_AFFN_DFL) scope = wq_affn_dfl; else scope = attrs->affn_scope; pt = &wq_pod_types[scope]; if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) && likely(pt->nr_pods)) return pt; /* * Before workqueue_init_topology(), only SYSTEM is available which is * initialized in workqueue_init_early(). */ pt = &wq_pod_types[WQ_AFFN_SYSTEM]; BUG_ON(!pt->nr_pods); return pt; } /** * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize * * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs. * * Return: 0 on success, -errno on failure. Even on failure, all fields * inside @pool proper are initialized and put_unbound_pool() can be called * on @pool safely to release it. */ static int init_worker_pool(struct worker_pool *pool) { raw_spin_lock_init(&pool->lock); pool->id = -1; pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; pool->watchdog_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); INIT_WORK(&pool->idle_cull_work, idle_cull_fn); timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); INIT_LIST_HEAD(&pool->workers); INIT_LIST_HEAD(&pool->dying_workers); ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; /* shouldn't fail above this point */ pool->attrs = alloc_workqueue_attrs(); if (!pool->attrs) return -ENOMEM; wqattrs_clear_for_pool(pool->attrs); return 0; } #ifdef CONFIG_LOCKDEP static void wq_init_lockdep(struct workqueue_struct *wq) { char *lock_name; lockdep_register_key(&wq->key); lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name); if (!lock_name) lock_name = wq->name; wq->lock_name = lock_name; lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0); } static void wq_unregister_lockdep(struct workqueue_struct *wq) { lockdep_unregister_key(&wq->key); } static void wq_free_lockdep(struct workqueue_struct *wq) { if (wq->lock_name != wq->name) kfree(wq->lock_name); } #else static void wq_init_lockdep(struct workqueue_struct *wq) { } static void wq_unregister_lockdep(struct workqueue_struct *wq) { } static void wq_free_lockdep(struct workqueue_struct *wq) { } #endif static void rcu_free_wq(struct rcu_head *rcu) { struct workqueue_struct *wq = container_of(rcu, struct workqueue_struct, rcu); wq_free_lockdep(wq); free_percpu(wq->cpu_pwq); free_workqueue_attrs(wq->unbound_attrs); kfree(wq); } static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); ida_destroy(&pool->worker_ida); free_workqueue_attrs(pool->attrs); kfree(pool); } /** * put_unbound_pool - put a worker_pool * @pool: worker_pool to put * * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU * safe manner. get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). * * Should be called with wq_pool_mutex held. */ static void put_unbound_pool(struct worker_pool *pool) { DECLARE_COMPLETION_ONSTACK(detach_completion); struct worker *worker; LIST_HEAD(cull_list); lockdep_assert_held(&wq_pool_mutex); if (--pool->refcnt) return; /* sanity checks */ if (WARN_ON(!(pool->cpu < 0)) || WARN_ON(!list_empty(&pool->worklist))) return; /* release id and unhash */ if (pool->id >= 0) idr_remove(&worker_pool_idr, pool->id); hash_del(&pool->hash_node); /* * Become the manager and destroy all workers. This prevents * @pool's workers from blocking on attach_mutex. We're the last * manager and @pool gets freed with the flag set. * * Having a concurrent manager is quite unlikely to happen as we can * only get here with * pwq->refcnt == pool->refcnt == 0 * which implies no work queued to the pool, which implies no worker can * become the manager. However a worker could have taken the role of * manager before the refcnts dropped to 0, since maybe_create_worker() * drops pool->lock */ while (true) { rcuwait_wait_event(&manager_wait, !(pool->flags & POOL_MANAGER_ACTIVE), TASK_UNINTERRUPTIBLE); mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); if (!(pool->flags & POOL_MANAGER_ACTIVE)) { pool->flags |= POOL_MANAGER_ACTIVE; break; } raw_spin_unlock_irq(&pool->lock); mutex_unlock(&wq_pool_attach_mutex); } while ((worker = first_idle_worker(pool))) set_worker_dying(worker, &cull_list); WARN_ON(pool->nr_workers || pool->nr_idle); raw_spin_unlock_irq(&pool->lock); wake_dying_workers(&cull_list); if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers)) pool->detach_completion = &detach_completion; mutex_unlock(&wq_pool_attach_mutex); if (pool->detach_completion) wait_for_completion(pool->detach_completion); /* shut down the timers */ del_timer_sync(&pool->idle_timer); cancel_work_sync(&pool->idle_cull_work); del_timer_sync(&pool->mayday_timer); /* RCU protected to allow dereferences from get_work_pool() */ call_rcu(&pool->rcu, rcu_free_pool); } /** * get_unbound_pool - get a worker_pool with the specified attributes * @attrs: the attributes of the worker_pool to get * * Obtain a worker_pool which has the same attributes as @attrs, bump the * reference count and return it. If there already is a matching * worker_pool, it will be used; otherwise, this function attempts to * create a new one. * * Should be called with wq_pool_mutex held. * * Return: On success, a worker_pool with the same attributes as @attrs. * On failure, %NULL. */ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA]; u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; int pod, node = NUMA_NO_NODE; lockdep_assert_held(&wq_pool_mutex); /* do we already have a matching pool? */ hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { if (wqattrs_equal(pool->attrs, attrs)) { pool->refcnt++; return pool; } } /* If __pod_cpumask is contained inside a NUMA pod, that's our node */ for (pod = 0; pod < pt->nr_pods; pod++) { if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) { node = pt->pod_node[pod]; break; } } /* nope, create a new one */ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node); if (!pool || init_worker_pool(pool) < 0) goto fail; pool->node = node; copy_workqueue_attrs(pool->attrs, attrs); wqattrs_clear_for_pool(pool->attrs); if (worker_pool_assign_id(pool) < 0) goto fail; /* create and start the initial worker */ if (wq_online && !create_worker(pool)) goto fail; /* install */ hash_add(unbound_pool_hash, &pool->hash_node, hash); return pool; fail: if (pool) put_unbound_pool(pool); return NULL; } static void rcu_free_pwq(struct rcu_head *rcu) { kmem_cache_free(pwq_cache, container_of(rcu, struct pool_workqueue, rcu)); } /* * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero * refcnt and needs to be destroyed. */ static void pwq_release_workfn(struct kthread_work *work) { struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, release_work); struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; bool is_last = false; /* * When @pwq is not linked, it doesn't hold any reference to the * @wq, and @wq is invalid to access. */ if (!list_empty(&pwq->pwqs_node)) { mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); is_last = list_empty(&wq->pwqs); mutex_unlock(&wq->mutex); } if (wq->flags & WQ_UNBOUND) { mutex_lock(&wq_pool_mutex); put_unbound_pool(pool); mutex_unlock(&wq_pool_mutex); } call_rcu(&pwq->rcu, rcu_free_pwq); /* * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Schedule RCU free. */ if (is_last) { wq_unregister_lockdep(wq); call_rcu(&wq->rcu, rcu_free_wq); } } /** * pwq_adjust_max_active - update a pwq's max_active to the current setting * @pwq: target pool_workqueue * * If @pwq isn't freezing, set @pwq->max_active to the associated * workqueue's saved_max_active and activate inactive work items * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. */ static void pwq_adjust_max_active(struct pool_workqueue *pwq) { struct workqueue_struct *wq = pwq->wq; bool freezable = wq->flags & WQ_FREEZABLE; unsigned long flags; /* for @wq->saved_max_active */ lockdep_assert_held(&wq->mutex); /* fast exit for non-freezable wqs */ if (!freezable && pwq->max_active == wq->saved_max_active) return; /* this function can be called during early boot w/ irq disabled */ raw_spin_lock_irqsave(&pwq->pool->lock, flags); /* * During [un]freezing, the caller is responsible for ensuring that * this function is called at least once after @workqueue_freezing * is updated and visible. */ if (!freezable || !workqueue_freezing) { pwq->max_active = wq->saved_max_active; while (!list_empty(&pwq->inactive_works) && pwq->nr_active < pwq->max_active) pwq_activate_first_inactive(pwq); kick_pool(pwq->pool); } else { pwq->max_active = 0; } raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); } /* initialize newly allocated @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); memset(pwq, 0, sizeof(*pwq)); pwq->pool = pool; pwq->wq = wq; pwq->flush_color = -1; pwq->refcnt = 1; INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); kthread_init_work(&pwq->release_work, pwq_release_workfn); } /* sync @pwq with the current state of its associated wq and link it */ static void link_pwq(struct pool_workqueue *pwq) { struct workqueue_struct *wq = pwq->wq; lockdep_assert_held(&wq->mutex); /* may be called multiple times, ignore if already linked */ if (!list_empty(&pwq->pwqs_node)) return; /* set the matching work_color */ pwq->work_color = wq->work_color; /* sync max_active to the current setting */ pwq_adjust_max_active(pwq); /* link in @pwq */ list_add_rcu(&pwq->pwqs_node, &wq->pwqs); } /* obtain a pool matching @attr and create a pwq associating the pool and @wq */ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct worker_pool *pool; struct pool_workqueue *pwq; lockdep_assert_held(&wq_pool_mutex); pool = get_unbound_pool(attrs); if (!pool) return NULL; pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); if (!pwq) { put_unbound_pool(pool); return NULL; } init_pwq(pwq, wq, pool); return pwq; } /** * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue * @cpu: the target CPU * @cpu_going_down: if >= 0, the CPU to consider as offline * * Calculate the cpumask a workqueue with @attrs should use on @pod. If * @cpu_going_down is >= 0, that cpu is considered offline during calculation. * The result is stored in @attrs->__pod_cpumask. * * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled * and @pod has online CPUs requested by @attrs, the returned cpumask is the * intersection of the possible CPUs of @pod and @attrs->cpumask. * * The caller is responsible for ensuring that the cpumask of @pod stays stable. */ static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu, int cpu_going_down) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int pod = pt->cpu_pod[cpu]; /* does @pod have any online CPUs @attrs wants? */ cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask); cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask); if (cpu_going_down >= 0) cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask); if (cpumask_empty(attrs->__pod_cpumask)) { cpumask_copy(attrs->__pod_cpumask, attrs->cpumask); return; } /* yeap, return possible CPUs in @pod that @attrs wants */ cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]); if (cpumask_empty(attrs->__pod_cpumask)) pr_warn_once("WARNING: workqueue cpumask: online intersect > " "possible intersect\n"); } /* install @pwq into @wq's cpu_pwq and return the old pwq */ static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, int cpu, struct pool_workqueue *pwq) { struct pool_workqueue *old_pwq; lockdep_assert_held(&wq_pool_mutex); lockdep_assert_held(&wq->mutex); /* link_pwq() can handle duplicate calls */ link_pwq(pwq); old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq); return old_pwq; } /* context to store the prepared attrs & pwqs before applying */ struct apply_wqattrs_ctx { struct workqueue_struct *wq; /* target workqueue */ struct workqueue_attrs *attrs; /* attrs to apply */ struct list_head list; /* queued for batching commit */ struct pool_workqueue *dfl_pwq; struct pool_workqueue *pwq_tbl[]; }; /* free the resources after success or abort */ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) { if (ctx) { int cpu; for_each_possible_cpu(cpu) put_pwq_unlocked(ctx->pwq_tbl[cpu]); put_pwq_unlocked(ctx->dfl_pwq); free_workqueue_attrs(ctx->attrs); kfree(ctx); } } /* allocate the attrs and pwqs for later installation */ static struct apply_wqattrs_ctx * apply_wqattrs_prepare(struct workqueue_struct *wq, const struct workqueue_attrs *attrs, const cpumask_var_t unbound_cpumask) { struct apply_wqattrs_ctx *ctx; struct workqueue_attrs *new_attrs; int cpu; lockdep_assert_held(&wq_pool_mutex); if (WARN_ON(attrs->affn_scope < 0 || attrs->affn_scope >= WQ_AFFN_NR_TYPES)) return ERR_PTR(-EINVAL); ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(); if (!ctx || !new_attrs) goto out_free; /* * If something goes wrong during CPU up/down, we'll fall back to * the default pwq covering whole @attrs->cpumask. Always create * it even if we don't use it immediately. */ copy_workqueue_attrs(new_attrs, attrs); wqattrs_actualize_cpumask(new_attrs, unbound_cpumask); cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); if (!ctx->dfl_pwq) goto out_free; for_each_possible_cpu(cpu) { if (new_attrs->ordered) { ctx->dfl_pwq->refcnt++; ctx->pwq_tbl[cpu] = ctx->dfl_pwq; } else { wq_calc_pod_cpumask(new_attrs, cpu, -1); ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs); if (!ctx->pwq_tbl[cpu]) goto out_free; } } /* save the user configured attrs and sanitize it. */ copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->attrs = new_attrs; ctx->wq = wq; return ctx; out_free: free_workqueue_attrs(new_attrs); apply_wqattrs_cleanup(ctx); return ERR_PTR(-ENOMEM); } /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) { int cpu; /* all pwqs have been created successfully, let's install'em */ mutex_lock(&ctx->wq->mutex); copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); /* save the previous pwq and install the new one */ for_each_possible_cpu(cpu) ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu, ctx->pwq_tbl[cpu]); /* @dfl_pwq might not have been used, ensure it's linked */ link_pwq(ctx->dfl_pwq); swap(ctx->wq->dfl_pwq, ctx->dfl_pwq); mutex_unlock(&ctx->wq->mutex); } static void apply_wqattrs_lock(void) { /* CPUs should stay stable across pwq creations and installations */ cpus_read_lock(); mutex_lock(&wq_pool_mutex); } static void apply_wqattrs_unlock(void) { mutex_unlock(&wq_pool_mutex); cpus_read_unlock(); } static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct apply_wqattrs_ctx *ctx; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) return -EINVAL; /* creating multiple pwqs breaks ordering guarantee */ if (!list_empty(&wq->pwqs)) { if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return -EINVAL; wq->flags &= ~__WQ_ORDERED; } ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); if (IS_ERR(ctx)) return PTR_ERR(ctx); /* the ctx has been prepared successfully, let's commit it */ apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); return 0; } /** * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue * @wq: the target workqueue * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() * * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that * work items are affine to the pod it was issued on. Older pwqs are released as * in-flight work items finish. Note that a work item which repeatedly requeues * itself back-to-back will stay on its current pwq. * * Performs GFP_KERNEL allocations. * * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock(). * * Return: 0 on success and -errno on failure. */ int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { int ret; lockdep_assert_cpus_held(); mutex_lock(&wq_pool_mutex); ret = apply_workqueue_attrs_locked(wq, attrs); mutex_unlock(&wq_pool_mutex); return ret; } /** * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug * @wq: the target workqueue * @cpu: the CPU to update pool association for * @hotplug_cpu: the CPU coming up or going down * @online: whether @cpu is coming up or going down * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of * @wq accordingly. * * * If pod affinity can't be adjusted due to memory allocation failure, it falls * back to @wq->dfl_pwq which may not be optimal but is always correct. * * Note that when the last allowed CPU of a pod goes offline for a workqueue * with a cpumask spanning multiple pods, the workers which were already * executing the work items for the workqueue will lose their CPU affinity and * may execute on any CPU. This is similar to how per-cpu workqueues behave on * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's * responsibility to flush the work item from CPU_DOWN_PREPARE. */ static void wq_update_pod(struct workqueue_struct *wq, int cpu, int hotplug_cpu, bool online) { int off_cpu = online ? -1 : hotplug_cpu; struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; lockdep_assert_held(&wq_pool_mutex); if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered) return; /* * We don't wanna alloc/free wq_attrs for each wq for each CPU. * Let's use a preallocated one. The following buf is protected by * CPU hotplug exclusion. */ target_attrs = wq_update_pod_attrs_buf; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); /* nothing to do if the target cpumask matches the current pwq */ wq_calc_pod_cpumask(target_attrs, cpu, off_cpu); pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu), lockdep_is_held(&wq_pool_mutex)); if (wqattrs_equal(target_attrs, pwq->pool->attrs)) return; /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n", wq->name); goto use_dfl_pwq; } /* Install the new pwq. */ mutex_lock(&wq->mutex); old_pwq = install_unbound_pwq(wq, cpu, pwq); goto out_unlock; use_dfl_pwq: mutex_lock(&wq->mutex); raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq); out_unlock: mutex_unlock(&wq->mutex); put_pwq_unlocked(old_pwq); } static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; int cpu, ret; wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); if (!wq->cpu_pwq) goto enomem; if (!(wq->flags & WQ_UNBOUND)) { for_each_possible_cpu(cpu) { struct pool_workqueue **pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); struct worker_pool *pool = &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]); *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); if (!*pwq_p) goto enomem; init_pwq(*pwq_p, wq, pool); mutex_lock(&wq->mutex); link_pwq(*pwq_p); mutex_unlock(&wq->mutex); } return 0; } cpus_read_lock(); if (wq->flags & __WQ_ORDERED) { ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), "ordering guarantee broken for workqueue %s\n", wq->name); } else { ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } cpus_read_unlock(); /* for unbound pwq, flush the pwq_release_worker ensures that the * pwq_release_workfn() completes before calling kfree(wq). */ if (ret) kthread_flush_worker(pwq_release_worker); return ret; enomem: if (wq->cpu_pwq) { for_each_possible_cpu(cpu) { struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); if (pwq) kmem_cache_free(pwq_cache, pwq); } free_percpu(wq->cpu_pwq); wq->cpu_pwq = NULL; } return -ENOMEM; } static int wq_clamp_max_active(int max_active, unsigned int flags, const char *name) { if (max_active < 1 || max_active > WQ_MAX_ACTIVE) pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", max_active, name, 1, WQ_MAX_ACTIVE); return clamp_val(max_active, 1, WQ_MAX_ACTIVE); } /* * Workqueues which may be used during memory reclaim should have a rescuer * to guarantee forward progress. */ static int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; int ret; if (!(wq->flags & WQ_MEM_RECLAIM)) return 0; rescuer = alloc_worker(NUMA_NO_NODE); if (!rescuer) { pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n", wq->name); return -ENOMEM; } rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name); if (IS_ERR(rescuer->task)) { ret = PTR_ERR(rescuer->task); pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", wq->name, ERR_PTR(ret)); kfree(rescuer); return ret; } wq->rescuer = rescuer; kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); return 0; } __printf(1, 4) struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...) { va_list args; struct workqueue_struct *wq; struct pool_workqueue *pwq; /* * Unbound && max_active == 1 used to imply ordered, which is no longer * the case on many machines due to per-pod pools. While * alloc_ordered_workqueue() is the right way to create an ordered * workqueue, keep the previous behavior to avoid subtle breakages. */ if ((flags & WQ_UNBOUND) && max_active == 1) flags |= __WQ_ORDERED; /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; /* allocate wq and format name */ wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; if (flags & WQ_UNBOUND) { wq->unbound_attrs = alloc_workqueue_attrs(); if (!wq->unbound_attrs) goto err_free_wq; } va_start(args, max_active); vsnprintf(wq->name, sizeof(wq->name), fmt, args); va_end(args); max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); INIT_LIST_HEAD(&wq->pwqs); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); INIT_LIST_HEAD(&wq->maydays); wq_init_lockdep(wq); INIT_LIST_HEAD(&wq->list); if (alloc_and_link_pwqs(wq) < 0) goto err_unreg_lockdep; if (wq_online && init_rescuer(wq) < 0) goto err_destroy; if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) goto err_destroy; /* * wq_pool_mutex protects global freeze state and workqueues list. * Grab it, adjust max_active and add the new @wq to workqueues * list. */ mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); mutex_unlock(&wq->mutex); list_add_tail_rcu(&wq->list, &workqueues); mutex_unlock(&wq_pool_mutex); return wq; err_unreg_lockdep: wq_unregister_lockdep(wq); wq_free_lockdep(wq); err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); return NULL; err_destroy: destroy_workqueue(wq); return NULL; } EXPORT_SYMBOL_GPL(alloc_workqueue); static bool pwq_busy(struct pool_workqueue *pwq) { int i; for (i = 0; i < WORK_NR_COLORS; i++) if (pwq->nr_in_flight[i]) return true; if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) return true; if (pwq->nr_active || !list_empty(&pwq->inactive_works)) return true; return false; } /** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue * * Safely destroy a workqueue. All work currently pending will be done first. */ void destroy_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; int cpu; /* * Remove it from sysfs first so that sanity check failure doesn't * lead to sysfs name conflicts. */ workqueue_sysfs_unregister(wq); /* mark the workqueue destruction is in progress */ mutex_lock(&wq->mutex); wq->flags |= __WQ_DESTROYING; mutex_unlock(&wq->mutex); /* drain it before proceeding with destruction */ drain_workqueue(wq); /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ if (wq->rescuer) { struct worker *rescuer = wq->rescuer; /* this prevents new queueing */ raw_spin_lock_irq(&wq_mayday_lock); wq->rescuer = NULL; raw_spin_unlock_irq(&wq_mayday_lock); /* rescuer will empty maydays list before exiting */ kthread_stop(rescuer->task); kfree(rescuer); } /* * Sanity checks - grab all the locks so that we wait for all * in-flight operations which may do put_pwq(). */ mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { raw_spin_lock_irq(&pwq->pool->lock); if (WARN_ON(pwq_busy(pwq))) { pr_warn("%s: %s has the following busy pwq\n", __func__, wq->name); show_pwq(pwq); raw_spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); show_one_workqueue(wq); return; } raw_spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); /* * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq * to put the base refs. @wq will be auto-destroyed from the last * pwq_put. RCU read lock prevents @wq from going away from under us. */ rcu_read_lock(); for_each_possible_cpu(cpu) { pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL); put_pwq_unlocked(pwq); } put_pwq_unlocked(wq->dfl_pwq); wq->dfl_pwq = NULL; rcu_read_unlock(); } EXPORT_SYMBOL_GPL(destroy_workqueue); /** * workqueue_set_max_active - adjust max_active of a workqueue * @wq: target workqueue * @max_active: new max_active value. * * Set max_active of @wq to @max_active. * * CONTEXT: * Don't call from IRQ context. */ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { struct pool_workqueue *pwq; /* disallow meddling with max_active for ordered workqueues */ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(workqueue_set_max_active); /** * current_work - retrieve %current task's work struct * * Determine if %current task is a workqueue worker and what it's working on. * Useful to find out the context that the %current task is running in. * * Return: work struct if %current task is a workqueue worker, %NULL otherwise. */ struct work_struct *current_work(void) { struct worker *worker = current_wq_worker(); return worker ? worker->current_work : NULL; } EXPORT_SYMBOL(current_work); /** * current_is_workqueue_rescuer - is %current workqueue rescuer? * * Determine whether %current is a workqueue rescuer. Can be used from * work functions to determine whether it's being run off the rescuer task. * * Return: %true if %current is a workqueue rescuer. %false otherwise. */ bool current_is_workqueue_rescuer(void) { struct worker *worker = current_wq_worker(); return worker && worker->rescue_wq; } /** * workqueue_congested - test whether a workqueue is congested * @cpu: CPU in question * @wq: target workqueue * * Test whether @wq's cpu workqueue for @cpu is congested. There is * no synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. * * With the exception of ordered workqueues, all workqueues have per-cpu * pool_workqueues, each with its own congested state. A workqueue being * congested on one CPU doesn't mean that the workqueue is contested on any * other CPUs. * * Return: * %true if congested, %false otherwise. */ bool workqueue_congested(int cpu, struct workqueue_struct *wq) { struct pool_workqueue *pwq; bool ret; rcu_read_lock(); preempt_disable(); if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); ret = !list_empty(&pwq->inactive_works); preempt_enable(); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(workqueue_congested); /** * work_busy - test whether a work is currently pending or running * @work: the work to be tested * * Test whether @work is currently pending or running. There is no * synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * * Return: * OR'd bitmask of WORK_BUSY_* bits. */ unsigned int work_busy(struct work_struct *work) { struct worker_pool *pool; unsigned long flags; unsigned int ret = 0; if (work_pending(work)) ret |= WORK_BUSY_PENDING; rcu_read_lock(); pool = get_work_pool(work); if (pool) { raw_spin_lock_irqsave(&pool->lock, flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; raw_spin_unlock_irqrestore(&pool->lock, flags); } rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(work_busy); /** * set_worker_desc - set description for the current work item * @fmt: printf-style format string * @...: arguments for the format string * * This function can be called by a running work function to describe what * the work item is about. If the worker task gets dumped, this * information will be printed out together to help debugging. The * description can be at most WORKER_DESC_LEN including the trailing '\0'. */ void set_worker_desc(const char *fmt, ...) { struct worker *worker = current_wq_worker(); va_list args; if (worker) { va_start(args, fmt); vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); va_end(args); } } EXPORT_SYMBOL_GPL(set_worker_desc); /** * print_worker_info - print out worker information and description * @log_lvl: the log level to use when printing * @task: target task * * If @task is a worker and currently executing a work item, print out the * name of the workqueue being serviced and worker description set with * set_worker_desc() by the currently executing work item. * * This function can be safely called on any task as long as the * task_struct itself is accessible. While safe, this function isn't * synchronized and may print out mixups or garbages of limited length. */ void print_worker_info(const char *log_lvl, struct task_struct *task) { work_func_t *fn = NULL; char name[WQ_NAME_LEN] = { }; char desc[WORKER_DESC_LEN] = { }; struct pool_workqueue *pwq = NULL; struct workqueue_struct *wq = NULL; struct worker *worker; if (!(task->flags & PF_WQ_WORKER)) return; /* * This function is called without any synchronization and @task * could be in any state. Be careful with dereferences. */ worker = kthread_probe_data(task); /* * Carefully copy the associated workqueue's workfn, name and desc. * Keep the original last '\0' in case the original is garbage. */ copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn)); copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq)); copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq)); copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1); copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { printk("%sWorkqueue: %s %ps", log_lvl, name, fn); if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); } } static void pr_cont_pool_info(struct worker_pool *pool) { pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); if (pool->node != NUMA_NO_NODE) pr_cont(" node=%d", pool->node); pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); } struct pr_cont_work_struct { bool comma; work_func_t func; long ctr; }; static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp) { if (!pcwsp->ctr) goto out_record; if (func == pcwsp->func) { pcwsp->ctr++; return; } if (pcwsp->ctr == 1) pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func); else pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func); pcwsp->ctr = 0; out_record: if ((long)func == -1L) return; pcwsp->comma = comma; pcwsp->func = func; pcwsp->ctr = 1; } static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp) { if (work->func == wq_barrier_func) { struct wq_barrier *barr; barr = container_of(work, struct wq_barrier, work); pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { if (!comma) pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); pr_cont_work_flush(comma, work->func, pcwsp); } } static void show_pwq(struct pool_workqueue *pwq) { struct pr_cont_work_struct pcws = { .ctr = 0, }; struct worker_pool *pool = pwq->pool; struct work_struct *work; struct worker *worker; bool has_in_flight = false, has_pending = false; int bkt; pr_info(" pwq %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" active=%d/%d refcnt=%d%s\n", pwq->nr_active, pwq->max_active, pwq->refcnt, !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (worker->current_pwq == pwq) { has_in_flight = true; break; } } if (has_in_flight) { bool comma = false; pr_info(" in-flight:"); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (worker->current_pwq != pwq) continue; pr_cont("%s %d%s:%ps", comma ? "," : "", task_pid_nr(worker->task), worker->rescue_wq ? "(RESCUER)" : "", worker->current_func); list_for_each_entry(work, &worker->scheduled, entry) pr_cont_work(false, work, &pcws); pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); comma = true; } pr_cont("\n"); } list_for_each_entry(work, &pool->worklist, entry) { if (get_work_pwq(work) == pwq) { has_pending = true; break; } } if (has_pending) { bool comma = false; pr_info(" pending:"); list_for_each_entry(work, &pool->worklist, entry) { if (get_work_pwq(work) != pwq) continue; pr_cont_work(comma, work, &pcws); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); pr_cont("\n"); } if (!list_empty(&pwq->inactive_works)) { bool comma = false; pr_info(" inactive:"); list_for_each_entry(work, &pwq->inactive_works, entry) { pr_cont_work(comma, work, &pcws); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); pr_cont("\n"); } } /** * show_one_workqueue - dump state of specified workqueue * @wq: workqueue whose state will be printed */ void show_one_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; bool idle = true; unsigned long flags; for_each_pwq(pwq, wq) { if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { idle = false; break; } } if (idle) /* Nothing to print for idle workqueue */ return; pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, flags); if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { /* * Defer printing to avoid deadlocks in console * drivers that queue work while holding locks * also taken in their write paths. */ printk_deferred_enter(); show_pwq(pwq); printk_deferred_exit(); } raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_all_workqueues(). Avoid triggering * hard lockup. */ touch_nmi_watchdog(); } } /** * show_one_worker_pool - dump state of specified worker pool * @pool: worker pool whose state will be printed */ static void show_one_worker_pool(struct worker_pool *pool) { struct worker *worker; bool first = true; unsigned long flags; unsigned long hung = 0; raw_spin_lock_irqsave(&pool->lock, flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; /* How long the first pending work is waiting for a worker. */ if (!list_empty(&pool->worklist)) hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000; /* * Defer printing to avoid deadlocks in console drivers that * queue work while holding locks also taken in their write * paths. */ printk_deferred_enter(); pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers); if (pool->manager) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); list_for_each_entry(worker, &pool->idle_list, entry) { pr_cont(" %s%d", first ? "idle: " : "", task_pid_nr(worker->task)); first = false; } pr_cont("\n"); printk_deferred_exit(); next_pool: raw_spin_unlock_irqrestore(&pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_all_workqueues(). Avoid triggering * hard lockup. */ touch_nmi_watchdog(); } /** * show_all_workqueues - dump workqueue state * * Called from a sysrq handler and prints out all busy workqueues and pools. */ void show_all_workqueues(void) { struct workqueue_struct *wq; struct worker_pool *pool; int pi; rcu_read_lock(); pr_info("Showing busy workqueues and worker pools:\n"); list_for_each_entry_rcu(wq, &workqueues, list) show_one_workqueue(wq); for_each_pool(pool, pi) show_one_worker_pool(pool); rcu_read_unlock(); } /** * show_freezable_workqueues - dump freezable workqueue state * * Called from try_to_freeze_tasks() and prints out all freezable workqueues * still busy. */ void show_freezable_workqueues(void) { struct workqueue_struct *wq; rcu_read_lock(); pr_info("Showing freezable workqueues that are still busy:\n"); list_for_each_entry_rcu(wq, &workqueues, list) { if (!(wq->flags & WQ_FREEZABLE)) continue; show_one_workqueue(wq); } rcu_read_unlock(); } /* used to show worker information through /proc/PID/{comm,stat,status} */ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) { int off; /* always show the actual comm */ off = strscpy(buf, task->comm, size); if (off < 0) return; /* stabilize PF_WQ_WORKER and worker pool association */ mutex_lock(&wq_pool_attach_mutex); if (task->flags & PF_WQ_WORKER) { struct worker *worker = kthread_data(task); struct worker_pool *pool = worker->pool; if (pool) { raw_spin_lock_irq(&pool->lock); /* * ->desc tracks information (wq name or * set_worker_desc()) for the latest execution. If * current, prepend '+', otherwise '-'. */ if (worker->desc[0] != '\0') { if (worker->current_work) scnprintf(buf + off, size - off, "+%s", worker->desc); else scnprintf(buf + off, size - off, "-%s", worker->desc); } raw_spin_unlock_irq(&pool->lock); } } mutex_unlock(&wq_pool_attach_mutex); } #ifdef CONFIG_SMP /* * CPU hotplug. * * There are two challenges in supporting CPU hotplug. Firstly, there * are a lot of assumptions on strong associations among work, pwq and * pool which make migrating pending and scheduled works very * difficult to implement without impacting hot paths. Secondly, * worker pools serve mix of short, long and very long running works making * blocked draining impractical. * * This is solved by allowing the pools to be disassociated from the CPU * running as an unbound one and allowing it to be reattached later if the * cpu comes back online. */ static void unbind_workers(int cpu) { struct worker_pool *pool; struct worker *worker; for_each_cpu_worker_pool(pool, cpu) { mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); /* * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * must be on the cpu. After this, they may become diasporas. * And the preemption disabled section in their sched callbacks * are guaranteed to see WORKER_UNBOUND since the code here * is on the same cpu. */ for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; /* * The handling of nr_running in sched callbacks are disabled * now. Zap nr_running. After this, nr_running stays zero and * need_more_worker() and keep_working() are always true as * long as the worklist is not empty. This pool now behaves as * an unbound (in terms of concurrency management) pool which * are served by workers tied to the pool. */ pool->nr_running = 0; /* * With concurrency management just turned off, a busy * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. */ kick_pool(pool); raw_spin_unlock_irq(&pool->lock); for_each_pool_worker(worker, pool) unbind_worker(worker); mutex_unlock(&wq_pool_attach_mutex); } } /** * rebind_workers - rebind all workers of a pool to the associated CPU * @pool: pool of interest * * @pool->cpu is coming online. Rebind all workers to the CPU. */ static void rebind_workers(struct worker_pool *pool) { struct worker *worker; lockdep_assert_held(&wq_pool_attach_mutex); /* * Restore CPU affinity of all workers. As all idle workers should * be on the run-queue of the associated CPU before any local * wake-ups for concurrency management happen, restore CPU affinity * of all workers first and then clear UNBOUND. As we're called * from CPU_ONLINE, the following shouldn't fail. */ for_each_pool_worker(worker, pool) { kthread_set_per_cpu(worker->task, pool->cpu); WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)) < 0); } raw_spin_lock_irq(&pool->lock); pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { unsigned int worker_flags = worker->flags; /* * We want to clear UNBOUND but can't directly call * worker_clr_flags() or adjust nr_running. Atomically * replace UNBOUND with another NOT_RUNNING flag REBOUND. * @worker will clear REBOUND using worker_clr_flags() when * it initiates the next execution cycle thus restoring * concurrency management. Note that when or whether * @worker clears REBOUND doesn't affect correctness. * * WRITE_ONCE() is necessary because @worker->flags may be * tested without holding any lock in * wq_worker_running(). Without it, NOT_RUNNING test may * fail incorrectly leading to premature concurrency * management operations. */ WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); worker_flags |= WORKER_REBOUND; worker_flags &= ~WORKER_UNBOUND; WRITE_ONCE(worker->flags, worker_flags); } raw_spin_unlock_irq(&pool->lock); } /** * restore_unbound_workers_cpumask - restore cpumask of unbound workers * @pool: unbound pool of interest * @cpu: the CPU which is coming up * * An unbound pool may end up with a cpumask which doesn't have any online * CPUs. When a worker of such pool get scheduled, the scheduler resets * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any * online CPU before, cpus_allowed of all its workers should be restored. */ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) { static cpumask_t cpumask; struct worker *worker; lockdep_assert_held(&wq_pool_attach_mutex); /* is @cpu allowed for @pool? */ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) return; cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); /* as we're called from CPU_ONLINE, the following shouldn't fail */ for_each_pool_worker(worker, pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); } int workqueue_prepare_cpu(unsigned int cpu) { struct worker_pool *pool; for_each_cpu_worker_pool(pool, cpu) { if (pool->nr_workers) continue; if (!create_worker(pool)) return -ENOMEM; } return 0; } int workqueue_online_cpu(unsigned int cpu) { struct worker_pool *pool; struct workqueue_struct *wq; int pi; mutex_lock(&wq_pool_mutex); for_each_pool(pool, pi) { mutex_lock(&wq_pool_attach_mutex); if (pool->cpu == cpu) rebind_workers(pool); else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); mutex_unlock(&wq_pool_attach_mutex); } /* update pod affinity of unbound workqueues */ list_for_each_entry(wq, &workqueues, list) { struct workqueue_attrs *attrs = wq->unbound_attrs; if (attrs) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) wq_update_pod(wq, tcpu, cpu, true); } } mutex_unlock(&wq_pool_mutex); return 0; } int workqueue_offline_cpu(unsigned int cpu) { struct workqueue_struct *wq; /* unbinding per-cpu workers should happen on the local CPU */ if (WARN_ON(cpu != smp_processor_id())) return -1; unbind_workers(cpu); /* update pod affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); list_for_each_entry(wq, &workqueues, list) { struct workqueue_attrs *attrs = wq->unbound_attrs; if (attrs) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) wq_update_pod(wq, tcpu, cpu, false); } } mutex_unlock(&wq_pool_mutex); return 0; } struct work_for_cpu { struct work_struct work; long (*fn)(void *); void *arg; long ret; }; static void work_for_cpu_fn(struct work_struct *work) { struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); wfc->ret = wfc->fn(wfc->arg); } /** * work_on_cpu - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function arg * * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. * * Return: The value @fn returns. */ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); schedule_work_on(cpu, &wfc.work); flush_work(&wfc.work); destroy_work_on_stack(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); /** * work_on_cpu_safe - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function argument * * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold * any locks which would prevent @fn from completing. * * Return: The value @fn returns. */ long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) { long ret = -ENODEV; cpus_read_lock(); if (cpu_online(cpu)) ret = work_on_cpu(cpu, fn, arg); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(work_on_cpu_safe); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER /** * freeze_workqueues_begin - begin freezing workqueues * * Start freezing workqueues. After this function returns, all freezable * workqueues will queue new works to their inactive_works list instead of * pool->worklist. * * CONTEXT: * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. */ void freeze_workqueues_begin(void) { struct workqueue_struct *wq; struct pool_workqueue *pwq; mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); mutex_unlock(&wq->mutex); } mutex_unlock(&wq_pool_mutex); } /** * freeze_workqueues_busy - are freezable workqueues still busy? * * Check whether freezing is complete. This function must be called * between freeze_workqueues_begin() and thaw_workqueues(). * * CONTEXT: * Grabs and releases wq_pool_mutex. * * Return: * %true if some freezable workqueues are still busy. %false if freezing * is complete. */ bool freeze_workqueues_busy(void) { bool busy = false; struct workqueue_struct *wq; struct pool_workqueue *pwq; mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(!workqueue_freezing); list_for_each_entry(wq, &workqueues, list) { if (!(wq->flags & WQ_FREEZABLE)) continue; /* * nr_active is monotonically decreasing. It's safe * to peek without lock. */ rcu_read_lock(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; rcu_read_unlock(); goto out_unlock; } } rcu_read_unlock(); } out_unlock: mutex_unlock(&wq_pool_mutex); return busy; } /** * thaw_workqueues - thaw workqueues * * Thaw workqueues. Normal queueing is restored and all collected * frozen works are transferred to their respective pool worklists. * * CONTEXT: * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. */ void thaw_workqueues(void) { struct workqueue_struct *wq; struct pool_workqueue *pwq; mutex_lock(&wq_pool_mutex); if (!workqueue_freezing) goto out_unlock; workqueue_freezing = false; /* restore max_active and repopulate worklist */ list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); mutex_unlock(&wq->mutex); } out_unlock: mutex_unlock(&wq_pool_mutex); } #endif /* CONFIG_FREEZER */ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) { LIST_HEAD(ctxs); int ret = 0; struct workqueue_struct *wq; struct apply_wqattrs_ctx *ctx, *n; lockdep_assert_held(&wq_pool_mutex); list_for_each_entry(wq, &workqueues, list) { if (!(wq->flags & WQ_UNBOUND)) continue; /* creating multiple pwqs breaks ordering guarantee */ if (!list_empty(&wq->pwqs)) { if (wq->flags & __WQ_ORDERED_EXPLICIT) continue; wq->flags &= ~__WQ_ORDERED; } ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); break; } list_add_tail(&ctx->list, &ctxs); } list_for_each_entry_safe(ctx, n, &ctxs, list) { if (!ret) apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); } if (!ret) { mutex_lock(&wq_pool_attach_mutex); cpumask_copy(wq_unbound_cpumask, unbound_cpumask); mutex_unlock(&wq_pool_attach_mutex); } return ret; } /** * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask * @cpumask: the cpumask to set * * The low-level workqueues cpumask is a global cpumask that limits * the affinity of all unbound workqueues. This function check the @cpumask * and apply it to all unbound workqueues and updates all pwqs of them. * * Return: 0 - Success * -EINVAL - Invalid @cpumask * -ENOMEM - Failed to allocate memory for attrs or pwqs. */ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) { int ret = -EINVAL; /* * Not excluding isolated cpus on purpose. * If the user wishes to include them, we allow that. */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); if (cpumask_equal(cpumask, wq_unbound_cpumask)) { ret = 0; goto out_unlock; } ret = workqueue_apply_unbound_cpumask(cpumask); out_unlock: apply_wqattrs_unlock(); } return ret; } static int parse_affn_scope(const char *val) { int i; for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) { if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i]))) return i; } return -EINVAL; } static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) { struct workqueue_struct *wq; int affn, cpu; affn = parse_affn_scope(val); if (affn < 0) return affn; if (affn == WQ_AFFN_DFL) return -EINVAL; cpus_read_lock(); mutex_lock(&wq_pool_mutex); wq_affn_dfl = affn; list_for_each_entry(wq, &workqueues, list) { for_each_online_cpu(cpu) { wq_update_pod(wq, cpu, cpu, true); } } mutex_unlock(&wq_pool_mutex); cpus_read_unlock(); return 0; } static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp) { return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]); } static const struct kernel_param_ops wq_affn_dfl_ops = { .set = wq_affn_dfl_set, .get = wq_affn_dfl_get, }; module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644); #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the * following attributes. * * per_cpu RO bool : whether the workqueue is per-cpu or unbound * max_active RW int : maximum number of in-flight work items * * Unbound workqueues have the following extra attributes. * * nice RW int : nice value of the workers * cpumask RW mask : bitmask of allowed CPUs for the workers * affinity_scope RW str : worker CPU affinity scope (cache, numa, none) * affinity_strict RW bool : worker CPU affinity is strict */ struct wq_device { struct workqueue_struct *wq; struct device dev; }; static struct workqueue_struct *dev_to_wq(struct device *dev) { struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); return wq_dev->wq; } static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); } static DEVICE_ATTR_RO(per_cpu); static ssize_t max_active_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); } static ssize_t max_active_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); int val; if (sscanf(buf, "%d", &val) != 1 || val <= 0) return -EINVAL; workqueue_set_max_active(wq, val); return count; } static DEVICE_ATTR_RW(max_active); static struct attribute *wq_sysfs_attrs[] = { &dev_attr_per_cpu.attr, &dev_attr_max_active.attr, NULL, }; ATTRIBUTE_GROUPS(wq_sysfs); static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); mutex_unlock(&wq->mutex); return written; } /* prepare workqueue_attrs for sysfs store operations */ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) { struct workqueue_attrs *attrs; lockdep_assert_held(&wq_pool_mutex); attrs = alloc_workqueue_attrs(); if (!attrs) return NULL; copy_workqueue_attrs(attrs, wq->unbound_attrs); return attrs; } static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int ret = -ENOMEM; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) goto out_unlock; if (sscanf(buf, "%d", &attrs->nice) == 1 && attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) ret = apply_workqueue_attrs_locked(wq, attrs); else ret = -EINVAL; out_unlock: apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static ssize_t wq_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(wq->unbound_attrs->cpumask)); mutex_unlock(&wq->mutex); return written; } static ssize_t wq_cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int ret = -ENOMEM; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) goto out_unlock; ret = cpumask_parse(buf, attrs->cpumask); if (!ret) ret = apply_workqueue_attrs_locked(wq, attrs); out_unlock: apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static ssize_t wq_affn_scope_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL) written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n", wq_affn_names[WQ_AFFN_DFL], wq_affn_names[wq_affn_dfl]); else written = scnprintf(buf, PAGE_SIZE, "%s\n", wq_affn_names[wq->unbound_attrs->affn_scope]); mutex_unlock(&wq->mutex); return written; } static ssize_t wq_affn_scope_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int affn, ret = -ENOMEM; affn = parse_affn_scope(buf); if (affn < 0) return affn; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (attrs) { attrs->affn_scope = affn; ret = apply_workqueue_attrs_locked(wq, attrs); } apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static ssize_t wq_affinity_strict_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->affn_strict); } static ssize_t wq_affinity_strict_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int v, ret = -ENOMEM; if (sscanf(buf, "%d", &v) != 1) return -EINVAL; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (attrs) { attrs->affn_strict = (bool)v; ret = apply_workqueue_attrs_locked(wq, attrs); } apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static struct device_attribute wq_sysfs_unbound_attrs[] = { __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store), __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store), __ATTR_NULL, }; static struct bus_type wq_subsys = { .name = "workqueue", .dev_groups = wq_sysfs_groups, }; static ssize_t wq_unbound_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { int written; mutex_lock(&wq_pool_mutex); written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(wq_unbound_cpumask)); mutex_unlock(&wq_pool_mutex); return written; } static ssize_t wq_unbound_cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { cpumask_var_t cpumask; int ret; if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) return -ENOMEM; ret = cpumask_parse(buf, cpumask); if (!ret) ret = workqueue_set_unbound_cpumask(cpumask); free_cpumask_var(cpumask); return ret ? ret : count; } static struct device_attribute wq_sysfs_cpumask_attr = __ATTR(cpumask, 0644, wq_unbound_cpumask_show, wq_unbound_cpumask_store); static int __init wq_sysfs_init(void) { struct device *dev_root; int err; err = subsys_virtual_register(&wq_subsys, NULL); if (err) return err; dev_root = bus_get_dev_root(&wq_subsys); if (dev_root) { err = device_create_file(dev_root, &wq_sysfs_cpumask_attr); put_device(dev_root); } return err; } core_initcall(wq_sysfs_init); static void wq_device_release(struct device *dev) { struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); kfree(wq_dev); } /** * workqueue_sysfs_register - make a workqueue visible in sysfs * @wq: the workqueue to register * * Expose @wq in sysfs under /sys/bus/workqueue/devices. * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set * which is the preferred method. * * Workqueue user should use this function directly iff it wants to apply * workqueue_attrs before making the workqueue visible in sysfs; otherwise, * apply_workqueue_attrs() may race against userland updating the * attributes. * * Return: 0 on success, -errno on failure. */ int workqueue_sysfs_register(struct workqueue_struct *wq) { struct wq_device *wq_dev; int ret; /* * Adjusting max_active or creating new pwqs by applying * attributes breaks ordering guarantee. Disallow exposing ordered * workqueues. */ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return -EINVAL; wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); if (!wq_dev) return -ENOMEM; wq_dev->wq = wq; wq_dev->dev.bus = &wq_subsys; wq_dev->dev.release = wq_device_release; dev_set_name(&wq_dev->dev, "%s", wq->name); /* * unbound_attrs are created separately. Suppress uevent until * everything is ready. */ dev_set_uevent_suppress(&wq_dev->dev, true); ret = device_register(&wq_dev->dev); if (ret) { put_device(&wq_dev->dev); wq->wq_dev = NULL; return ret; } if (wq->flags & WQ_UNBOUND) { struct device_attribute *attr; for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { ret = device_create_file(&wq_dev->dev, attr); if (ret) { device_unregister(&wq_dev->dev); wq->wq_dev = NULL; return ret; } } } dev_set_uevent_suppress(&wq_dev->dev, false); kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); return 0; } /** * workqueue_sysfs_unregister - undo workqueue_sysfs_register() * @wq: the workqueue to unregister * * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. */ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { struct wq_device *wq_dev = wq->wq_dev; if (!wq->wq_dev) return; wq->wq_dev = NULL; device_unregister(&wq_dev->dev); } #else /* CONFIG_SYSFS */ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } #endif /* CONFIG_SYSFS */ /* * Workqueue watchdog. * * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal * flush dependency, a concurrency managed work item which stays RUNNING * indefinitely. Workqueue stalls can be very difficult to debug as the * usual warning mechanisms don't trigger and internal workqueue state is * largely opaque. * * Workqueue watchdog monitors all worker pools periodically and dumps * state if some pools failed to make forward progress for a while where * forward progress is defined as the first item on ->worklist changing. * * This mechanism is controlled through the kernel parameter * "workqueue.watchdog_thresh" which can be updated at runtime through the * corresponding sysfs parameter file. */ #ifdef CONFIG_WQ_WATCHDOG static unsigned long wq_watchdog_thresh = 30; static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; /* * Show workers that might prevent the processing of pending work items. * The only candidates are CPU-bound workers in the running state. * Pending work items should be handled by another idle worker * in all other situations. */ static void show_cpu_pool_hog(struct worker_pool *pool) { struct worker *worker; unsigned long flags; int bkt; raw_spin_lock_irqsave(&pool->lock, flags); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (task_is_running(worker->task)) { /* * Defer printing to avoid deadlocks in console * drivers that queue work while holding locks * also taken in their write paths. */ printk_deferred_enter(); pr_info("pool %d:\n", pool->id); sched_show_task(worker->task); printk_deferred_exit(); } } raw_spin_unlock_irqrestore(&pool->lock, flags); } static void show_cpu_pools_hogs(void) { struct worker_pool *pool; int pi; pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n"); rcu_read_lock(); for_each_pool(pool, pi) { if (pool->cpu_stall) show_cpu_pool_hog(pool); } rcu_read_unlock(); } static void wq_watchdog_reset_touched(void) { int cpu; wq_watchdog_touched = jiffies; for_each_possible_cpu(cpu) per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; } static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; bool lockup_detected = false; bool cpu_pool_stall = false; unsigned long now = jiffies; struct worker_pool *pool; int pi; if (!thresh) return; rcu_read_lock(); for_each_pool(pool, pi) { unsigned long pool_ts, touched, ts; pool->cpu_stall = false; if (list_empty(&pool->worklist)) continue; /* * If a virtual machine is stopped by the host it can look to * the watchdog like a stall. */ kvm_check_and_clear_guest_paused(); /* get the latest of pool and touched timestamps */ if (pool->cpu >= 0) touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); else touched = READ_ONCE(wq_watchdog_touched); pool_ts = READ_ONCE(pool->watchdog_ts); if (time_after(pool_ts, touched)) ts = pool_ts; else ts = touched; /* did we stall? */ if (time_after(now, ts + thresh)) { lockup_detected = true; if (pool->cpu >= 0) { pool->cpu_stall = true; cpu_pool_stall = true; } pr_emerg("BUG: workqueue lockup - pool"); pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", jiffies_to_msecs(now - pool_ts) / 1000); } } rcu_read_unlock(); if (lockup_detected) show_all_workqueues(); if (cpu_pool_stall) show_cpu_pools_hogs(); wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh); } notrace void wq_watchdog_touch(int cpu) { if (cpu >= 0) per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; wq_watchdog_touched = jiffies; } static void wq_watchdog_set_thresh(unsigned long thresh) { wq_watchdog_thresh = 0; del_timer_sync(&wq_watchdog_timer); if (thresh) { wq_watchdog_thresh = thresh; wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); } } static int wq_watchdog_param_set_thresh(const char *val, const struct kernel_param *kp) { unsigned long thresh; int ret; ret = kstrtoul(val, 0, &thresh); if (ret) return ret; if (system_wq) wq_watchdog_set_thresh(thresh); else wq_watchdog_thresh = thresh; return 0; } static const struct kernel_param_ops wq_watchdog_thresh_ops = { .set = wq_watchdog_param_set_thresh, .get = param_get_ulong, }; module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, 0644); static void wq_watchdog_init(void) { timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE); wq_watchdog_set_thresh(wq_watchdog_thresh); } #else /* CONFIG_WQ_WATCHDOG */ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ /** * workqueue_init_early - early init for workqueue subsystem * * This is the first step of three-staged workqueue subsystem initialization and * invoked as soon as the bare basics - memory allocation, cpumasks and idr are * up. It sets up all the data structures and system workqueues and allows early * boot code to create workqueues and queue/cancel work items. Actual work item * execution starts only after kthreads can be created and scheduled right * before early initcalls. */ void __init workqueue_init_early(void) { struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM]; int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int i, cpu; BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); wq_update_pod_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!wq_update_pod_attrs_buf); /* initialize WQ_AFFN_SYSTEM pods */ pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL); pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL); pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod); BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE)); pt->nr_pods = 1; cpumask_copy(pt->pod_cpus[0], cpu_possible_mask); pt->pod_node[0] = NUMA_NO_NODE; pt->cpu_pod[0] = 0; /* initialize CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; i = 0; for_each_cpu_worker_pool(pool, cpu) { BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu)); pool->attrs->nice = std_nice[i++]; pool->attrs->affn_strict = true; pool->node = cpu_to_node(cpu); /* alloc pool ID */ mutex_lock(&wq_pool_mutex); BUG_ON(worker_pool_assign_id(pool)); mutex_unlock(&wq_pool_mutex); } } /* create default unbound and ordered wq attrs */ for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; unbound_std_wq_attrs[i] = attrs; /* * An ordered wq should have only one pwq as ordering is * guaranteed by max_active which is enforced by pwqs. */ BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; attrs->ordered = true; ordered_wq_attrs[i] = attrs; } system_wq = alloc_workqueue("events", 0, 0); system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", WQ_POWER_EFFICIENT, 0); system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient", WQ_FREEZABLE | WQ_POWER_EFFICIENT, 0); BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); } static void __init wq_cpu_intensive_thresh_init(void) { unsigned long thresh; unsigned long bogo; pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); BUG_ON(IS_ERR(pwq_release_worker)); /* if the user set it to a specific value, keep it */ if (wq_cpu_intensive_thresh_us != ULONG_MAX) return; /* * The default of 10ms is derived from the fact that most modern (as of * 2023) processors can do a lot in 10ms and that it's just below what * most consider human-perceivable. However, the kernel also runs on a * lot slower CPUs including microcontrollers where the threshold is way * too low. * * Let's scale up the threshold upto 1 second if BogoMips is below 4000. * This is by no means accurate but it doesn't have to be. The mechanism * is still useful even when the threshold is fully scaled up. Also, as * the reports would usually be applicable to everyone, some machines * operating on longer thresholds won't significantly diminish their * usefulness. */ thresh = 10 * USEC_PER_MSEC; /* see init/calibrate.c for lpj -> BogoMIPS calculation */ bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1); if (bogo < 4000) thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC); pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n", loops_per_jiffy, bogo, thresh); wq_cpu_intensive_thresh_us = thresh; } /** * workqueue_init - bring workqueue subsystem fully online * * This is the second step of three-staged workqueue subsystem initialization * and invoked as soon as kthreads can be created and scheduled. Workqueues have * been created and work items queued on them, but there are no kworkers * executing the work items yet. Populate the worker pools with the initial * workers and enable future kworker creations. */ void __init workqueue_init(void) { struct workqueue_struct *wq; struct worker_pool *pool; int cpu, bkt; wq_cpu_intensive_thresh_init(); mutex_lock(&wq_pool_mutex); /* * Per-cpu pools created earlier could be missing node hint. Fix them * up. Also, create a rescuer for workqueues that requested it. */ for_each_possible_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { pool->node = cpu_to_node(cpu); } } list_for_each_entry(wq, &workqueues, list) { WARN(init_rescuer(wq), "workqueue: failed to create early rescuer for %s", wq->name); } mutex_unlock(&wq_pool_mutex); /* create the initial workers */ for_each_online_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { pool->flags &= ~POOL_DISASSOCIATED; BUG_ON(!create_worker(pool)); } } hash_for_each(unbound_pool_hash, bkt, pool, hash_node) BUG_ON(!create_worker(pool)); wq_online = true; wq_watchdog_init(); } /* * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to * @cpu_shares_pod(). Each subset of CPUs that share a pod is assigned a unique * and consecutive pod ID. The rest of @pt is initialized accordingly. */ static void __init init_pod_type(struct wq_pod_type *pt, bool (*cpus_share_pod)(int, int)) { int cur, pre, cpu, pod; pt->nr_pods = 0; /* init @pt->cpu_pod[] according to @cpus_share_pod() */ pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); BUG_ON(!pt->cpu_pod); for_each_possible_cpu(cur) { for_each_possible_cpu(pre) { if (pre >= cur) { pt->cpu_pod[cur] = pt->nr_pods++; break; } if (cpus_share_pod(cur, pre)) { pt->cpu_pod[cur] = pt->cpu_pod[pre]; break; } } } /* init the rest to match @pt->cpu_pod[] */ pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL); pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL); BUG_ON(!pt->pod_cpus || !pt->pod_node); for (pod = 0; pod < pt->nr_pods; pod++) BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL)); for_each_possible_cpu(cpu) { cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]); pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu); } } static bool __init cpus_dont_share(int cpu0, int cpu1) { return false; } static bool __init cpus_share_smt(int cpu0, int cpu1) { #ifdef CONFIG_SCHED_SMT return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); #else return false; #endif } static bool __init cpus_share_numa(int cpu0, int cpu1) { return cpu_to_node(cpu0) == cpu_to_node(cpu1); } /** * workqueue_init_topology - initialize CPU pods for unbound workqueues * * This is the third step of there-staged workqueue subsystem initialization and * invoked after SMP and topology information are fully initialized. It * initializes the unbound CPU pods accordingly. */ void __init workqueue_init_topology(void) { struct workqueue_struct *wq; int cpu; init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share); init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt); init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache); init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); mutex_lock(&wq_pool_mutex); /* * Workqueues allocated earlier would have all CPUs sharing the default * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU * combinations to apply per-pod sharing. */ list_for_each_entry(wq, &workqueues, list) { for_each_online_cpu(cpu) { wq_update_pod(wq, cpu, cpu, true); } } mutex_unlock(&wq_pool_mutex); } void __warn_flushing_systemwide_wq(void) { pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n"); dump_stack(); } EXPORT_SYMBOL(__warn_flushing_systemwide_wq); static int __init workqueue_unbound_cpus_setup(char *str) { if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) { cpumask_clear(&wq_cmdline_cpumask); pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n"); } return 1; } __setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup); |
3390 658 585 3261 3390 822 822 875 585 2100 154 1951 585 585 585 585 585 73 585 86 86 86 291 290 154 1951 1662 291 29 290 2100 2100 3129 2985 2489 1538 3390 2100 86 582 582 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 | // SPDX-License-Identifier: GPL-2.0 /* * buffered writeback throttling. loosely based on CoDel. We can't drop * packets for IO scheduling, so the logic is something like this: * * - Monitor latencies in a defined window of time. * - If the minimum latency in the above window exceeds some target, increment * scaling step and scale down queue depth by a factor of 2x. The monitoring * window is then shrunk to 100 / sqrt(scaling step + 1). * - For any window where we don't have solid data on what the latencies * look like, retain status quo. * - If latencies look good, decrement scaling step. * - If we're only doing writes, allow the scaling step to go negative. This * will temporarily boost write performance, snapping back to a stable * scaling step of 0 if reads show up or the heavy writers finish. Unlike * positive scaling steps where we shrink the monitoring window, a negative * scaling step retains the default step==0 window size. * * Copyright (C) 2016 Jens Axboe * */ #include <linux/kernel.h> #include <linux/blk_types.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/swap.h> #include "blk-stat.h" #include "blk-wbt.h" #include "blk-rq-qos.h" #include "elevator.h" #define CREATE_TRACE_POINTS #include <trace/events/wbt.h> enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ WBT_READ = 2, /* read */ WBT_KSWAPD = 4, /* write, from kswapd */ WBT_DISCARD = 8, /* discard */ WBT_NR_BITS = 4, /* number of bits */ }; enum { WBT_RWQ_BG = 0, WBT_RWQ_KSWAPD, WBT_RWQ_DISCARD, WBT_NUM_RWQ, }; /* * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered * to WBT_STATE_OFF/ON_MANUAL. */ enum { WBT_STATE_ON_DEFAULT = 1, /* on by default */ WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ WBT_STATE_OFF_DEFAULT = 3, /* off by default */ WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ }; struct rq_wb { /* * Settings that govern how we throttle */ unsigned int wb_background; /* background writeback */ unsigned int wb_normal; /* normal writeback */ short enable_state; /* WBT_STATE_* */ /* * Number of consecutive periods where we don't have enough * information to make a firm scale up/down decision. */ unsigned int unknown_cnt; u64 win_nsec; /* default window size */ u64 cur_win_nsec; /* current window size */ struct blk_stat_callback *cb; u64 sync_issue; void *sync_cookie; unsigned int wc; unsigned long last_issue; /* last non-throttled issue */ unsigned long last_comp; /* last non-throttled comp */ unsigned long min_lat_nsec; struct rq_qos rqos; struct rq_wait rq_wait[WBT_NUM_RWQ]; struct rq_depth rq_depth; }; static inline struct rq_wb *RQWB(struct rq_qos *rqos) { return container_of(rqos, struct rq_wb, rqos); } static inline void wbt_clear_state(struct request *rq) { rq->wbt_flags = 0; } static inline enum wbt_flags wbt_flags(struct request *rq) { return rq->wbt_flags; } static inline bool wbt_is_tracked(struct request *rq) { return rq->wbt_flags & WBT_TRACKED; } static inline bool wbt_is_read(struct request *rq) { return rq->wbt_flags & WBT_READ; } enum { /* * Default setting, we'll scale up (to 75% of QD max) or down (min 1) * from here depending on device stats */ RWB_DEF_DEPTH = 16, /* * 100msec window */ RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, /* * Disregard stats, if we don't meet this minimum */ RWB_MIN_WRITE_SAMPLES = 3, /* * If we have this number of consecutive windows with not enough * information to scale up or down, scale up. */ RWB_UNKNOWN_BUMP = 5, }; static inline bool rwb_enabled(struct rq_wb *rwb) { return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT && rwb->enable_state != WBT_STATE_OFF_MANUAL; } static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) { if (rwb_enabled(rwb)) { const unsigned long cur = jiffies; if (cur != *var) *var = cur; } } /* * If a task was rate throttled in balance_dirty_pages() within the last * second or so, use that to indicate a higher cleaning rate. */ static bool wb_recent_wait(struct rq_wb *rwb) { struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, enum wbt_flags wb_acct) { if (wb_acct & WBT_KSWAPD) return &rwb->rq_wait[WBT_RWQ_KSWAPD]; else if (wb_acct & WBT_DISCARD) return &rwb->rq_wait[WBT_RWQ_DISCARD]; return &rwb->rq_wait[WBT_RWQ_BG]; } static void rwb_wake_all(struct rq_wb *rwb) { int i; for (i = 0; i < WBT_NUM_RWQ; i++) { struct rq_wait *rqw = &rwb->rq_wait[i]; if (wq_has_sleeper(&rqw->wait)) wake_up_all(&rqw->wait); } } static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, enum wbt_flags wb_acct) { int inflight, limit; inflight = atomic_dec_return(&rqw->inflight); /* * For discards, our limit is always the background. For writes, if * the device does write back caching, drop further down before we * wake people up. */ if (wb_acct & WBT_DISCARD) limit = rwb->wb_background; else if (rwb->wc && !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; /* * Don't wake anyone up if we are above the normal limit. */ if (inflight && inflight >= limit) return; if (wq_has_sleeper(&rqw->wait)) { int diff = limit - inflight; if (!inflight || diff >= rwb->wb_background / 2) wake_up_all(&rqw->wait); } } static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct) { struct rq_wb *rwb = RQWB(rqos); struct rq_wait *rqw; if (!(wb_acct & WBT_TRACKED)) return; rqw = get_rq_wait(rwb, wb_acct); wbt_rqw_done(rwb, rqw, wb_acct); } /* * Called on completion of a request. Note that it's also called when * a request is merged, when the request gets freed. */ static void wbt_done(struct rq_qos *rqos, struct request *rq) { struct rq_wb *rwb = RQWB(rqos); if (!wbt_is_tracked(rq)) { if (rwb->sync_cookie == rq) { rwb->sync_issue = 0; rwb->sync_cookie = NULL; } if (wbt_is_read(rq)) wb_timestamp(rwb, &rwb->last_comp); } else { WARN_ON_ONCE(rq == rwb->sync_cookie); __wbt_done(rqos, wbt_flags(rq)); } wbt_clear_state(rq); } static inline bool stat_sample_valid(struct blk_rq_stat *stat) { /* * We need at least one read sample, and a minimum of * RWB_MIN_WRITE_SAMPLES. We require some write samples to know * that it's writes impacting us, and not just some sole read on * a device that is in a lower power state. */ return (stat[READ].nr_samples >= 1 && stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES); } static u64 rwb_sync_issue_lat(struct rq_wb *rwb) { u64 now, issue = READ_ONCE(rwb->sync_issue); if (!issue || !rwb->sync_cookie) return 0; now = ktime_to_ns(ktime_get()); return now - issue; } static inline unsigned int wbt_inflight(struct rq_wb *rwb) { unsigned int i, ret = 0; for (i = 0; i < WBT_NUM_RWQ; i++) ret += atomic_read(&rwb->rq_wait[i].inflight); return ret; } enum { LAT_OK = 1, LAT_UNKNOWN, LAT_UNKNOWN_WRITES, LAT_EXCEEDED, }; static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; /* * If our stored sync issue exceeds the window size, or it * exceeds our min target AND we haven't logged any entries, * flag the latency as exceeded. wbt works off completion latencies, * but for a flooded device, a single sync IO can take a long time * to complete after being issued. If this time exceeds our * monitoring window AND we didn't see any other completions in that * window, then count that sync IO as a violation of the latency. */ thislat = rwb_sync_issue_lat(rwb); if (thislat > rwb->cur_win_nsec || (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) { trace_wbt_lat(bdi, thislat); return LAT_EXCEEDED; } /* * No read/write mix, if stat isn't valid */ if (!stat_sample_valid(stat)) { /* * If we had writes in this stat window and the window is * current, we're only doing writes. If a task recently * waited or still has writes in flights, consider us doing * just writes as well. */ if (stat[WRITE].nr_samples || wb_recent_wait(rwb) || wbt_inflight(rwb)) return LAT_UNKNOWN_WRITES; return LAT_UNKNOWN; } /* * If the 'min' latency exceeds our target, step down. */ if (stat[READ].min > rwb->min_lat_nsec) { trace_wbt_lat(bdi, stat[READ].min); trace_wbt_stat(bdi, stat); return LAT_EXCEEDED; } if (rqd->scale_step) trace_wbt_stat(bdi, stat); return LAT_OK; } static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, rwb->wb_background, rwb->wb_normal, rqd->max_depth); } static void calc_wb_limits(struct rq_wb *rwb) { if (rwb->min_lat_nsec == 0) { rwb->wb_normal = rwb->wb_background = 0; } else if (rwb->rq_depth.max_depth <= 2) { rwb->wb_normal = rwb->rq_depth.max_depth; rwb->wb_background = 1; } else { rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2; rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4; } } static void scale_up(struct rq_wb *rwb) { if (!rq_depth_scale_up(&rwb->rq_depth)) return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_wake_all(rwb); rwb_trace_step(rwb, tracepoint_string("scale up")); } static void scale_down(struct rq_wb *rwb, bool hard_throttle) { if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle)) return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_trace_step(rwb, tracepoint_string("scale down")); } static void rwb_arm_timer(struct rq_wb *rwb) { struct rq_depth *rqd = &rwb->rq_depth; if (rqd->scale_step > 0) { /* * We should speed this up, using some variant of a fast * integer inverse square root calculation. Since we only do * this for every window expiration, it's not a huge deal, * though. */ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, int_sqrt((rqd->scale_step + 1) << 8)); } else { /* * For step < 0, we don't want to increase/decrease the * window size. */ rwb->cur_win_nsec = rwb->win_nsec; } blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec); } static void wb_timer_fn(struct blk_stat_callback *cb) { struct rq_wb *rwb = cb->data; struct rq_depth *rqd = &rwb->rq_depth; unsigned int inflight = wbt_inflight(rwb); int status; if (!rwb->rqos.disk) return; status = latency_exceeded(rwb, cb->stat); trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight); /* * If we exceeded the latency target, step down. If we did not, * step one level up. If we don't know enough to say either exceeded * or ok, then don't do anything. */ switch (status) { case LAT_EXCEEDED: scale_down(rwb, true); break; case LAT_OK: scale_up(rwb); break; case LAT_UNKNOWN_WRITES: /* * We started a the center step, but don't have a valid * read/write sample, but we do have writes going on. * Allow step to go negative, to increase write perf. */ scale_up(rwb); break; case LAT_UNKNOWN: if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) break; /* * We get here when previously scaled reduced depth, and we * currently don't have a valid read/write sample. For that * case, slowly return to center state (step == 0). */ if (rqd->scale_step > 0) scale_up(rwb); else if (rqd->scale_step < 0) scale_down(rwb, false); break; default: break; } /* * Re-arm timer, if we have IO in flight */ if (rqd->scale_step || inflight) rwb_arm_timer(rwb); } static void wbt_update_limits(struct rq_wb *rwb) { struct rq_depth *rqd = &rwb->rq_depth; rqd->scale_step = 0; rqd->scaled_max = false; rq_depth_calc_max_depth(rqd); calc_wb_limits(rwb); rwb_wake_all(rwb); } bool wbt_disabled(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); return !rqos || !rwb_enabled(RQWB(rqos)); } u64 wbt_get_min_lat(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) return 0; return RQWB(rqos)->min_lat_nsec; } void wbt_set_min_lat(struct request_queue *q, u64 val) { struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) return; RQWB(rqos)->min_lat_nsec = val; if (val) RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; else RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL; wbt_update_limits(RQWB(rqos)); } static bool close_io(struct rq_wb *rwb) { const unsigned long now = jiffies; return time_before(now, rwb->last_issue + HZ / 10) || time_before(now, rwb->last_comp + HZ / 10); } #define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) { unsigned int limit; if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD) return rwb->wb_background; /* * At this point we know it's a buffered write. If this is * kswapd trying to free memory, or REQ_SYNC is set, then * it's WB_SYNC_ALL writeback, and we'll use the max limit for * that. If the write is marked as a background write, then use * the idle limit, or go to normal if we haven't had competing * IO for a bit. */ if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) limit = rwb->rq_depth.max_depth; else if ((opf & REQ_BACKGROUND) || close_io(rwb)) { /* * If less than 100ms since we completed unrelated IO, * limit us to half the depth for background writeback. */ limit = rwb->wb_background; } else limit = rwb->wb_normal; return limit; } struct wbt_wait_data { struct rq_wb *rwb; enum wbt_flags wb_acct; blk_opf_t opf; }; static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data) { struct wbt_wait_data *data = private_data; return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf)); } static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data) { struct wbt_wait_data *data = private_data; wbt_rqw_done(data->rwb, rqw, data->wb_acct); } /* * Block if we will exceed our limit, or if we are currently waiting for * the timer to kick off queuing again. */ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, blk_opf_t opf) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); struct wbt_wait_data data = { .rwb = rwb, .wb_acct = wb_acct, .opf = opf, }; rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); } static inline bool wbt_should_throttle(struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_WRITE: /* * Don't throttle WRITE_ODIRECT */ if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE)) return false; fallthrough; case REQ_OP_DISCARD: return true; default: return false; } } static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) { enum wbt_flags flags = 0; if (!rwb_enabled(rwb)) return 0; if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; } else if (wbt_should_throttle(bio)) { if (current_is_kswapd()) flags |= WBT_KSWAPD; if (bio_op(bio) == REQ_OP_DISCARD) flags |= WBT_DISCARD; flags |= WBT_TRACKED; } return flags; } static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); enum wbt_flags flags = bio_to_wbt_flags(rwb, bio); __wbt_done(rqos, flags); } /* * May sleep, if we have exceeded the writeback limits. Caller can pass * in an irq held spinlock, if it holds one when calling this function. * If we do sleep, we'll release and re-grab it. */ static void wbt_wait(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); enum wbt_flags flags; flags = bio_to_wbt_flags(rwb, bio); if (!(flags & WBT_TRACKED)) { if (flags & WBT_READ) wb_timestamp(rwb, &rwb->last_issue); return; } __wbt_wait(rwb, flags, bio->bi_opf); if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); } static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); rq->wbt_flags |= bio_to_wbt_flags(rwb, bio); } static void wbt_issue(struct rq_qos *rqos, struct request *rq) { struct rq_wb *rwb = RQWB(rqos); if (!rwb_enabled(rwb)) return; /* * Track sync issue, in case it takes a long time to complete. Allows us * to react quicker, if a sync IO takes a long time to complete. Note * that this is just a hint. The request can go away when it completes, * so it's important we never dereference it. We only use the address to * compare with, which is why we store the sync_issue time locally. */ if (wbt_is_read(rq) && !rwb->sync_issue) { rwb->sync_cookie = rq; rwb->sync_issue = rq->io_start_time_ns; } } static void wbt_requeue(struct rq_qos *rqos, struct request *rq) { struct rq_wb *rwb = RQWB(rqos); if (!rwb_enabled(rwb)) return; if (rq == rwb->sync_cookie) { rwb->sync_issue = 0; rwb->sync_cookie = NULL; } } void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) { struct rq_qos *rqos = wbt_rq_qos(q); if (rqos) RQWB(rqos)->wc = write_cache_on; } /* * Enable wbt if defaults are configured that way */ void wbt_enable_default(struct gendisk *disk) { struct request_queue *q = disk->queue; struct rq_qos *rqos; bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ); if (q->elevator && test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags)) enable = false; /* Throttling already enabled? */ rqos = wbt_rq_qos(q); if (rqos) { if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; return; } /* Queue not registered? Maybe shutting down... */ if (!blk_queue_registered(q)) return; if (queue_is_mq(q) && enable) wbt_init(disk); } EXPORT_SYMBOL_GPL(wbt_enable_default); u64 wbt_default_latency_nsec(struct request_queue *q) { /* * We default to 2msec for non-rotational storage, and 75msec * for rotational storage. */ if (blk_queue_nonrot(q)) return 2000000ULL; else return 75000000ULL; } static int wbt_data_dir(const struct request *rq) { const enum req_op op = req_op(rq); if (op == REQ_OP_READ) return READ; else if (op_is_write(op)) return WRITE; /* don't account */ return -1; } static void wbt_queue_depth_changed(struct rq_qos *rqos) { RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue); wbt_update_limits(RQWB(rqos)); } static void wbt_exit(struct rq_qos *rqos) { struct rq_wb *rwb = RQWB(rqos); blk_stat_remove_callback(rqos->disk->queue, rwb->cb); blk_stat_free_callback(rwb->cb); kfree(rwb); } /* * Disable wbt, if enabled by default. */ void wbt_disable_default(struct gendisk *disk) { struct rq_qos *rqos = wbt_rq_qos(disk->queue); struct rq_wb *rwb; if (!rqos) return; rwb = RQWB(rqos); if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { blk_stat_deactivate(rwb->cb); rwb->enable_state = WBT_STATE_OFF_DEFAULT; } } EXPORT_SYMBOL_GPL(wbt_disable_default); #ifdef CONFIG_BLK_DEBUG_FS static int wbt_curr_win_nsec_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%llu\n", rwb->cur_win_nsec); return 0; } static int wbt_enabled_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%d\n", rwb->enable_state); return 0; } static int wbt_id_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; seq_printf(m, "%u\n", rqos->id); return 0; } static int wbt_inflight_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); int i; for (i = 0; i < WBT_NUM_RWQ; i++) seq_printf(m, "%d: inflight %d\n", i, atomic_read(&rwb->rq_wait[i].inflight)); return 0; } static int wbt_min_lat_nsec_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%lu\n", rwb->min_lat_nsec); return 0; } static int wbt_unknown_cnt_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%u\n", rwb->unknown_cnt); return 0; } static int wbt_normal_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%u\n", rwb->wb_normal); return 0; } static int wbt_background_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%u\n", rwb->wb_background); return 0; } static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = { {"curr_win_nsec", 0400, wbt_curr_win_nsec_show}, {"enabled", 0400, wbt_enabled_show}, {"id", 0400, wbt_id_show}, {"inflight", 0400, wbt_inflight_show}, {"min_lat_nsec", 0400, wbt_min_lat_nsec_show}, {"unknown_cnt", 0400, wbt_unknown_cnt_show}, {"wb_normal", 0400, wbt_normal_show}, {"wb_background", 0400, wbt_background_show}, {}, }; #endif static const struct rq_qos_ops wbt_rqos_ops = { .throttle = wbt_wait, .issue = wbt_issue, .track = wbt_track, .requeue = wbt_requeue, .done = wbt_done, .cleanup = wbt_cleanup, .queue_depth_changed = wbt_queue_depth_changed, .exit = wbt_exit, #ifdef CONFIG_BLK_DEBUG_FS .debugfs_attrs = wbt_debugfs_attrs, #endif }; int wbt_init(struct gendisk *disk) { struct request_queue *q = disk->queue; struct rq_wb *rwb; int i; int ret; rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); if (!rwb) return -ENOMEM; rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb); if (!rwb->cb) { kfree(rwb); return -ENOMEM; } for (i = 0; i < WBT_NUM_RWQ; i++) rq_wait_init(&rwb->rq_wait[i]); rwb->last_comp = rwb->last_issue = jiffies; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags); rwb->rq_depth.default_depth = RWB_DEF_DEPTH; rwb->min_lat_nsec = wbt_default_latency_nsec(q); rwb->rq_depth.queue_depth = blk_queue_depth(q); wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. */ mutex_lock(&q->rq_qos_mutex); ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops); mutex_unlock(&q->rq_qos_mutex); if (ret) goto err_free; blk_stat_add_callback(q, rwb->cb); return 0; err_free: blk_stat_free_callback(rwb->cb); kfree(rwb); return ret; } |
1 96 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2006, Johannes Berg <johannes@sipsolutions.net> */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/leds.h> #include "ieee80211_i.h" #define MAC80211_BLINK_DELAY 50 /* ms */ static inline void ieee80211_led_rx(struct ieee80211_local *local) { #ifdef CONFIG_MAC80211_LEDS if (!atomic_read(&local->rx_led_active)) return; led_trigger_blink_oneshot(&local->rx_led, MAC80211_BLINK_DELAY, MAC80211_BLINK_DELAY, 0); #endif } static inline void ieee80211_led_tx(struct ieee80211_local *local) { #ifdef CONFIG_MAC80211_LEDS if (!atomic_read(&local->tx_led_active)) return; led_trigger_blink_oneshot(&local->tx_led, MAC80211_BLINK_DELAY, MAC80211_BLINK_DELAY, 0); #endif } #ifdef CONFIG_MAC80211_LEDS void ieee80211_led_assoc(struct ieee80211_local *local, bool associated); void ieee80211_led_radio(struct ieee80211_local *local, bool enabled); void ieee80211_alloc_led_names(struct ieee80211_local *local); void ieee80211_free_led_names(struct ieee80211_local *local); void ieee80211_led_init(struct ieee80211_local *local); void ieee80211_led_exit(struct ieee80211_local *local); void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off); #else static inline void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { } static inline void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { } static inline void ieee80211_alloc_led_names(struct ieee80211_local *local) { } static inline void ieee80211_free_led_names(struct ieee80211_local *local) { } static inline void ieee80211_led_init(struct ieee80211_local *local) { } static inline void ieee80211_led_exit(struct ieee80211_local *local) { } static inline void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off) { } #endif static inline void ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, int bytes) { #ifdef CONFIG_MAC80211_LEDS if (atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->tx_bytes += bytes; #endif } static inline void ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, int bytes) { #ifdef CONFIG_MAC80211_LEDS if (atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->rx_bytes += bytes; #endif } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_TIMEOUT_H #define _NF_CONNTRACK_TIMEOUT_H #include <net/net_namespace.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <linux/refcount.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #define CTNL_TIMEOUT_NAME_MAX 32 struct nf_ct_timeout { __u16 l3num; const struct nf_conntrack_l4proto *l4proto; char data[]; }; struct nf_conn_timeout { struct nf_ct_timeout __rcu *timeout; }; static inline unsigned int * nf_ct_timeout_data(const struct nf_conn_timeout *t) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_ct_timeout *timeout; timeout = rcu_dereference(t->timeout); if (timeout == NULL) return NULL; return (unsigned int *)timeout->data; #else return NULL; #endif } static inline struct nf_conn_timeout *nf_ct_timeout_find(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT return nf_ct_ext_find(ct, NF_CT_EXT_TIMEOUT); #else return NULL; #endif } static inline struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct, struct nf_ct_timeout *timeout, gfp_t gfp) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; timeout_ext = nf_ct_ext_add(ct, NF_CT_EXT_TIMEOUT, gfp); if (timeout_ext == NULL) return NULL; rcu_assign_pointer(timeout_ext->timeout, timeout); return timeout_ext; #else return NULL; #endif }; static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) { unsigned int *timeouts = NULL; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; timeout_ext = nf_ct_timeout_find(ct); if (timeout_ext) timeouts = nf_ct_timeout_data(timeout_ext); #endif return timeouts; } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout); int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name); void nf_ct_destroy_timeout(struct nf_conn *ct); #else static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name) { return -EOPNOTSUPP; } static inline void nf_ct_destroy_timeout(struct nf_conn *ct) { return; } #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_ct_timeout_hooks { struct nf_ct_timeout *(*timeout_find_get)(struct net *net, const char *name); void (*timeout_put)(struct nf_ct_timeout *timeout); }; extern const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook; #endif #endif /* _NF_CONNTRACK_TIMEOUT_H */ |
203 10 462 847 840 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Internal procfs definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/binfmts.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> struct ctl_table_header; struct mempolicy; /* * This is not completely implemented yet. The idea is to * create an in-memory tree (like the actual /proc filesystem * tree) of these proc_dir_entries, so that we can dynamically * add new files to /proc. * * parent/subdir are used for the directory structure (every /proc file has a * parent, but "subdir" is empty for all non-directory entries). * subdir_node is used to build the rb tree "subdir" of the parent. */ struct proc_dir_entry { /* * number of callers into module in progress; * negative -> it's going away RSN */ atomic_t in_use; refcount_t refcnt; struct list_head pde_openers; /* who did ->open, but not ->release */ /* protects ->pde_openers and all struct pde_opener instances */ spinlock_t pde_unload_lock; struct completion *pde_unload_completion; const struct inode_operations *proc_iops; union { const struct proc_ops *proc_ops; const struct file_operations *proc_dir_ops; }; const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; int (*single_show)(struct seq_file *, void *); }; proc_write_t write; void *data; unsigned int state_size; unsigned int low_ino; nlink_t nlink; kuid_t uid; kgid_t gid; loff_t size; struct proc_dir_entry *parent; struct rb_root subdir; struct rb_node subdir_node; char *name; umode_t mode; u8 flags; u8 namelen; char inline_name[]; } __randomize_layout; #define SIZEOF_PDE ( \ sizeof(struct proc_dir_entry) < 128 ? 128 : \ sizeof(struct proc_dir_entry) < 192 ? 192 : \ sizeof(struct proc_dir_entry) < 256 ? 256 : \ sizeof(struct proc_dir_entry) < 512 ? 512 : \ 0) #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) static inline bool pde_is_permanent(const struct proc_dir_entry *pde) { return pde->flags & PROC_ENTRY_PERMANENT; } static inline void pde_make_permanent(struct proc_dir_entry *pde) { pde->flags |= PROC_ENTRY_PERMANENT; } extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); union proc_op { int (*proc_get_link)(struct dentry *, struct path *); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); const char *lsm; }; struct proc_inode { struct pid *pid; unsigned int fd; union proc_op op; struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; struct hlist_node sibling_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; } __randomize_layout; /* * General functions */ static inline struct proc_inode *PROC_I(const struct inode *inode) { return container_of(inode, struct proc_inode, vfs_inode); } static inline struct proc_dir_entry *PDE(const struct inode *inode) { return PROC_I(inode)->pde; } static inline struct pid *proc_pid(const struct inode *inode) { return PROC_I(inode)->pid; } static inline struct task_struct *get_proc_task(const struct inode *inode) { return get_pid_task(proc_pid(inode), PIDTYPE_PID); } void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid); unsigned name_to_int(const struct qstr *qstr); /* * Offset of the first process in the /proc root directory.. */ #define FIRST_PROCESS_ENTRY 256 /* Worst case buffer size needed for holding an integer. */ #define PROC_NUMBUF 13 /* * array.c */ extern const struct file_operations proc_tid_children_operations; extern void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape); extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_status(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); /* * base.c */ extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); struct dentry *proc_pid_lookup(struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ typedef struct dentry *instantiate_t(struct dentry *, struct task_struct *, const void *); bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, instantiate_t, struct task_struct *, const void *); /* * generic.c */ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, struct proc_dir_entry **parent, void *data); struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, struct proc_dir_entry *dp); extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file *, struct dir_context *); int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); static inline void pde_get(struct proc_dir_entry *pde) { refcount_inc(&pde->refcnt); } extern void pde_put(struct proc_dir_entry *); static inline bool is_empty_pde(const struct proc_dir_entry *pde) { return S_ISDIR(pde->mode) && !pde->proc_iops; } extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *); /* * inode.c */ struct pde_opener { struct list_head lh; struct file *file; bool closing; struct completion *c; } __randomize_layout; extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; extern const struct super_operations proc_sops; void proc_init_kmemcache(void); void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern void proc_entry_rundown(struct proc_dir_entry *); /* * proc_namespaces.c */ extern const struct inode_operations proc_ns_dir_inode_operations; extern const struct file_operations proc_ns_dir_operations; /* * proc_net.c */ extern const struct file_operations proc_net_operations; extern const struct inode_operations proc_net_inode_operations; #ifdef CONFIG_NET extern int proc_net_init(void); #else static inline int proc_net_init(void) { return 0; } #endif /* * proc_self.c */ extern int proc_setup_self(struct super_block *); /* * proc_thread_self.c */ extern int proc_setup_thread_self(struct super_block *); extern void proc_thread_self_init(void); /* * proc_sysctl.c */ #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); extern void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } static inline void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { } #endif /* * proc_tty.c */ #ifdef CONFIG_TTY extern void proc_tty_init(void); #else static inline void proc_tty_init(void) {} #endif /* * root.c */ extern struct proc_dir_entry proc_root; extern void proc_self_init(void); /* * task_[no]mmu.c */ struct mem_size_stats; struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; struct vma_iterator iter; #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif } __randomize_layout; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); extern const struct file_operations proc_pid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, unsigned long *, unsigned long *, unsigned long *, unsigned long *); extern void task_mem(struct seq_file *, struct mm_struct *); extern const struct dentry_operations proc_net_dentry_ops; static inline void pde_force_lookup(struct proc_dir_entry *pde) { /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ pde->proc_dops = &proc_net_dentry_ops; } |
3252 3256 5 3258 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VIRTIO_RING_H #define _LINUX_VIRTIO_RING_H #include <asm/barrier.h> #include <linux/irqreturn.h> #include <uapi/linux/virtio_ring.h> /* * Barriers in virtio are tricky. Non-SMP virtio guests can't assume * they're not on an SMP host system, so they need to assume real * barriers. Non-SMP virtio hosts could skip the barriers, but does * anyone care? * * For virtio_pci on SMP, we don't need to order with respect to MMIO * accesses through relaxed memory I/O windows, so virt_mb() et al are * sufficient. * * For using virtio to talk to real devices (eg. other heterogeneous * CPUs) we do need real barriers. In theory, we could be using both * kinds of virtio, so it's a runtime decision, and the branch is * actually quite cheap. */ static inline void virtio_mb(bool weak_barriers) { if (weak_barriers) virt_mb(); else mb(); } static inline void virtio_rmb(bool weak_barriers) { if (weak_barriers) virt_rmb(); else dma_rmb(); } static inline void virtio_wmb(bool weak_barriers) { if (weak_barriers) virt_wmb(); else dma_wmb(); } #define virtio_store_mb(weak_barriers, p, v) \ do { \ if (weak_barriers) { \ virt_store_mb(*p, v); \ } else { \ WRITE_ONCE(*p, v); \ mb(); \ } \ } while (0) \ struct virtio_device; struct virtqueue; struct device; /* * Creates a virtqueue and allocates the descriptor ring. If * may_reduce_num is set, then this may allocate a smaller ring than * expected. The caller should query virtqueue_get_vring_size to learn * the actual size of the ring. */ struct virtqueue *vring_create_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Creates a virtqueue and allocates the descriptor ring with per * virtqueue DMA device. */ struct virtqueue *vring_create_virtqueue_dma(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name, struct device *dma_dev); /* * Creates a virtqueue with a standard layout but a caller-allocated * ring. */ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool ctx, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Destroys a virtqueue. If created with vring_create_virtqueue, this * also frees the ring. */ void vring_del_virtqueue(struct virtqueue *vq); /* Filter out transport-specific feature bits. */ void vring_transport_features(struct virtio_device *vdev); irqreturn_t vring_interrupt(int irq, void *_vq); u32 vring_notification_data(struct virtqueue *_vq); #endif /* _LINUX_VIRTIO_RING_H */ |
439 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 | // SPDX-License-Identifier: GPL-2.0 /* * This is a maximally equidistributed combined Tausworthe generator * based on code from GNU Scientific Library 1.5 (30 Jun 2004) * * lfsr113 version: * * x_n = (s1_n ^ s2_n ^ s3_n ^ s4_n) * * s1_{n+1} = (((s1_n & 4294967294) << 18) ^ (((s1_n << 6) ^ s1_n) >> 13)) * s2_{n+1} = (((s2_n & 4294967288) << 2) ^ (((s2_n << 2) ^ s2_n) >> 27)) * s3_{n+1} = (((s3_n & 4294967280) << 7) ^ (((s3_n << 13) ^ s3_n) >> 21)) * s4_{n+1} = (((s4_n & 4294967168) << 13) ^ (((s4_n << 3) ^ s4_n) >> 12)) * * The period of this generator is about 2^113 (see erratum paper). * * From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe * Generators", Mathematics of Computation, 65, 213 (1996), 203--213: * http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps * ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps * * There is an erratum in the paper "Tables of Maximally Equidistributed * Combined LFSR Generators", Mathematics of Computation, 68, 225 (1999), * 261--269: http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps * * ... the k_j most significant bits of z_j must be non-zero, * for each j. (Note: this restriction also applies to the * computer code given in [4], but was mistakenly not mentioned * in that paper.) * * This affects the seeding procedure by imposing the requirement * s1 > 1, s2 > 7, s3 > 15, s4 > 127. */ #include <linux/types.h> #include <linux/percpu.h> #include <linux/export.h> #include <linux/jiffies.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/bitops.h> #include <linux/slab.h> #include <asm/unaligned.h> /** * prandom_u32_state - seeded pseudo-random number generator. * @state: pointer to state structure holding seeded state. * * This is used for pseudo-randomness with no outside seeding. * For more random results, use get_random_u32(). */ u32 prandom_u32_state(struct rnd_state *state) { #define TAUSWORTHE(s, a, b, c, d) ((s & c) << d) ^ (((s << a) ^ s) >> b) state->s1 = TAUSWORTHE(state->s1, 6U, 13U, 4294967294U, 18U); state->s2 = TAUSWORTHE(state->s2, 2U, 27U, 4294967288U, 2U); state->s3 = TAUSWORTHE(state->s3, 13U, 21U, 4294967280U, 7U); state->s4 = TAUSWORTHE(state->s4, 3U, 12U, 4294967168U, 13U); return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4); } EXPORT_SYMBOL(prandom_u32_state); /** * prandom_bytes_state - get the requested number of pseudo-random bytes * * @state: pointer to state structure holding seeded state. * @buf: where to copy the pseudo-random bytes to * @bytes: the requested number of bytes * * This is used for pseudo-randomness with no outside seeding. * For more random results, use get_random_bytes(). */ void prandom_bytes_state(struct rnd_state *state, void *buf, size_t bytes) { u8 *ptr = buf; while (bytes >= sizeof(u32)) { put_unaligned(prandom_u32_state(state), (u32 *) ptr); ptr += sizeof(u32); bytes -= sizeof(u32); } if (bytes > 0) { u32 rem = prandom_u32_state(state); do { *ptr++ = (u8) rem; bytes--; rem >>= BITS_PER_BYTE; } while (bytes > 0); } } EXPORT_SYMBOL(prandom_bytes_state); static void prandom_warmup(struct rnd_state *state) { /* Calling RNG ten times to satisfy recurrence condition */ prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); } void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state) { int i; for_each_possible_cpu(i) { struct rnd_state *state = per_cpu_ptr(pcpu_state, i); u32 seeds[4]; get_random_bytes(&seeds, sizeof(seeds)); state->s1 = __seed(seeds[0], 2U); state->s2 = __seed(seeds[1], 8U); state->s3 = __seed(seeds[2], 16U); state->s4 = __seed(seeds[3], 128U); prandom_warmup(state); } } EXPORT_SYMBOL(prandom_seed_full_state); #ifdef CONFIG_RANDOM32_SELFTEST static struct prandom_test1 { u32 seed; u32 result; } test1[] = { { 1U, 3484351685U }, { 2U, 2623130059U }, { 3U, 3125133893U }, { 4U, 984847254U }, }; static struct prandom_test2 { u32 seed; u32 iteration; u32 result; } test2[] = { /* Test cases against taus113 from GSL library. */ { 931557656U, 959U, 2975593782U }, { 1339693295U, 876U, 3887776532U }, { 1545556285U, 961U, 1615538833U }, { 601730776U, 723U, 1776162651U }, { 1027516047U, 687U, 511983079U }, { 416526298U, 700U, 916156552U }, { 1395522032U, 652U, 2222063676U }, { 366221443U, 617U, 2992857763U }, { 1539836965U, 714U, 3783265725U }, { 556206671U, 994U, 799626459U }, { 684907218U, 799U, 367789491U }, { 2121230701U, 931U, 2115467001U }, { 1668516451U, 644U, 3620590685U }, { 768046066U, 883U, 2034077390U }, { 1989159136U, 833U, 1195767305U }, { 536585145U, 996U, 3577259204U }, { 1008129373U, 642U, 1478080776U }, { 1740775604U, 939U, 1264980372U }, { 1967883163U, 508U, 10734624U }, { 1923019697U, 730U, 3821419629U }, { 442079932U, 560U, 3440032343U }, { 1961302714U, 845U, 841962572U }, { 2030205964U, 962U, 1325144227U }, { 1160407529U, 507U, 240940858U }, { 635482502U, 779U, 4200489746U }, { 1252788931U, 699U, 867195434U }, { 1961817131U, 719U, 668237657U }, { 1071468216U, 983U, 917876630U }, { 1281848367U, 932U, 1003100039U }, { 582537119U, 780U, 1127273778U }, { 1973672777U, 853U, 1071368872U }, { 1896756996U, 762U, 1127851055U }, { 847917054U, 500U, 1717499075U }, { 1240520510U, 951U, 2849576657U }, { 1685071682U, 567U, 1961810396U }, { 1516232129U, 557U, 3173877U }, { 1208118903U, 612U, 1613145022U }, { 1817269927U, 693U, 4279122573U }, { 1510091701U, 717U, 638191229U }, { 365916850U, 807U, 600424314U }, { 399324359U, 702U, 1803598116U }, { 1318480274U, 779U, 2074237022U }, { 697758115U, 840U, 1483639402U }, { 1696507773U, 840U, 577415447U }, { 2081979121U, 981U, 3041486449U }, { 955646687U, 742U, 3846494357U }, { 1250683506U, 749U, 836419859U }, { 595003102U, 534U, 366794109U }, { 47485338U, 558U, 3521120834U }, { 619433479U, 610U, 3991783875U }, { 704096520U, 518U, 4139493852U }, { 1712224984U, 606U, 2393312003U }, { 1318233152U, 922U, 3880361134U }, { 855572992U, 761U, 1472974787U }, { 64721421U, 703U, 683860550U }, { 678931758U, 840U, 380616043U }, { 692711973U, 778U, 1382361947U }, { 677703619U, 530U, 2826914161U }, { 92393223U, 586U, 1522128471U }, { 1222592920U, 743U, 3466726667U }, { 358288986U, 695U, 1091956998U }, { 1935056945U, 958U, 514864477U }, { 735675993U, 990U, 1294239989U }, { 1560089402U, 897U, 2238551287U }, { 70616361U, 829U, 22483098U }, { 368234700U, 731U, 2913875084U }, { 20221190U, 879U, 1564152970U }, { 539444654U, 682U, 1835141259U }, { 1314987297U, 840U, 1801114136U }, { 2019295544U, 645U, 3286438930U }, { 469023838U, 716U, 1637918202U }, { 1843754496U, 653U, 2562092152U }, { 400672036U, 809U, 4264212785U }, { 404722249U, 965U, 2704116999U }, { 600702209U, 758U, 584979986U }, { 519953954U, 667U, 2574436237U }, { 1658071126U, 694U, 2214569490U }, { 420480037U, 749U, 3430010866U }, { 690103647U, 969U, 3700758083U }, { 1029424799U, 937U, 3787746841U }, { 2012608669U, 506U, 3362628973U }, { 1535432887U, 998U, 42610943U }, { 1330635533U, 857U, 3040806504U }, { 1223800550U, 539U, 3954229517U }, { 1322411537U, 680U, 3223250324U }, { 1877847898U, 945U, 2915147143U }, { 1646356099U, 874U, 965988280U }, { 805687536U, 744U, 4032277920U }, { 1948093210U, 633U, 1346597684U }, { 392609744U, 783U, 1636083295U }, { 690241304U, 770U, 1201031298U }, { 1360302965U, 696U, 1665394461U }, { 1220090946U, 780U, 1316922812U }, { 447092251U, 500U, 3438743375U }, { 1613868791U, 592U, 828546883U }, { 523430951U, 548U, 2552392304U }, { 726692899U, 810U, 1656872867U }, { 1364340021U, 836U, 3710513486U }, { 1986257729U, 931U, 935013962U }, { 407983964U, 921U, 728767059U }, }; static void prandom_state_selftest_seed(struct rnd_state *state, u32 seed) { #define LCG(x) ((x) * 69069U) /* super-duper LCG */ state->s1 = __seed(LCG(seed), 2U); state->s2 = __seed(LCG(state->s1), 8U); state->s3 = __seed(LCG(state->s2), 16U); state->s4 = __seed(LCG(state->s3), 128U); } static int __init prandom_state_selftest(void) { int i, j, errors = 0, runs = 0; bool error = false; for (i = 0; i < ARRAY_SIZE(test1); i++) { struct rnd_state state; prandom_state_selftest_seed(&state, test1[i].seed); prandom_warmup(&state); if (test1[i].result != prandom_u32_state(&state)) error = true; } if (error) pr_warn("prandom: seed boundary self test failed\n"); else pr_info("prandom: seed boundary self test passed\n"); for (i = 0; i < ARRAY_SIZE(test2); i++) { struct rnd_state state; prandom_state_selftest_seed(&state, test2[i].seed); prandom_warmup(&state); for (j = 0; j < test2[i].iteration - 1; j++) prandom_u32_state(&state); if (test2[i].result != prandom_u32_state(&state)) errors++; runs++; cond_resched(); } if (errors) pr_warn("prandom: %d/%d self tests failed\n", errors, runs); else pr_info("prandom: %d self tests passed\n", runs); return 0; } core_initcall(prandom_state_selftest); #endif |
1401 1402 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 | /* SPDX-License-Identifier: GPL-2.0+ */ /* * Driver for 8250/16550-type serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * * Copyright (C) 2001 Russell King. */ #include <linux/bits.h> #include <linux/serial_8250.h> #include <linux/serial_reg.h> #include <linux/dmaengine.h> #include "../serial_mctrl_gpio.h" struct uart_8250_dma { int (*tx_dma)(struct uart_8250_port *p); int (*rx_dma)(struct uart_8250_port *p); void (*prepare_tx_dma)(struct uart_8250_port *p); void (*prepare_rx_dma)(struct uart_8250_port *p); /* Filter function */ dma_filter_fn fn; /* Parameter to the filter function */ void *rx_param; void *tx_param; struct dma_slave_config rxconf; struct dma_slave_config txconf; struct dma_chan *rxchan; struct dma_chan *txchan; /* Device address base for DMA operations */ phys_addr_t rx_dma_addr; phys_addr_t tx_dma_addr; /* DMA address of the buffer in memory */ dma_addr_t rx_addr; dma_addr_t tx_addr; dma_cookie_t rx_cookie; dma_cookie_t tx_cookie; void *rx_buf; size_t rx_size; size_t tx_size; unsigned char tx_running; unsigned char tx_err; unsigned char rx_running; }; struct old_serial_port { unsigned int uart; unsigned int baud_base; unsigned int port; unsigned int irq; upf_t flags; unsigned char io_type; unsigned char __iomem *iomem_base; unsigned short iomem_reg_shift; }; struct serial8250_config { const char *name; unsigned short fifo_size; unsigned short tx_loadsz; unsigned char fcr; unsigned char rxtrig_bytes[UART_FCR_R_TRIG_MAX_STATE]; unsigned int flags; }; #define UART_CAP_FIFO BIT(8) /* UART has FIFO */ #define UART_CAP_EFR BIT(9) /* UART has EFR */ #define UART_CAP_SLEEP BIT(10) /* UART has IER sleep */ #define UART_CAP_AFE BIT(11) /* MCR-based hw flow control */ #define UART_CAP_UUE BIT(12) /* UART needs IER bit 6 set (Xscale) */ #define UART_CAP_RTOIE BIT(13) /* UART needs IER bit 4 set (Xscale, Tegra) */ #define UART_CAP_HFIFO BIT(14) /* UART has a "hidden" FIFO */ #define UART_CAP_RPM BIT(15) /* Runtime PM is active while idle */ #define UART_CAP_IRDA BIT(16) /* UART supports IrDA line discipline */ #define UART_CAP_MINI BIT(17) /* Mini UART on BCM283X family lacks: * STOP PARITY EPAR SPAR WLEN5 WLEN6 */ #define UART_CAP_NOTEMT BIT(18) /* UART without interrupt on TEMT available */ #define UART_BUG_QUOT BIT(0) /* UART has buggy quot LSB */ #define UART_BUG_TXEN BIT(1) /* UART has buggy TX IIR status */ #define UART_BUG_NOMSR BIT(2) /* UART has buggy MSR status bits (Au1x00) */ #define UART_BUG_THRE BIT(3) /* UART has buggy THRE reassertion */ #define UART_BUG_TXRACE BIT(5) /* UART Tx fails to set remote DR */ #ifdef CONFIG_SERIAL_8250_SHARE_IRQ #define SERIAL8250_SHARE_IRQS 1 #else #define SERIAL8250_SHARE_IRQS 0 #endif #define SERIAL8250_PORT_FLAGS(_base, _irq, _flags) \ { \ .iobase = _base, \ .irq = _irq, \ .uartclk = 1843200, \ .iotype = UPIO_PORT, \ .flags = UPF_BOOT_AUTOCONF | (_flags), \ } #define SERIAL8250_PORT(_base, _irq) SERIAL8250_PORT_FLAGS(_base, _irq, 0) static inline int serial_in(struct uart_8250_port *up, int offset) { return up->port.serial_in(&up->port, offset); } static inline void serial_out(struct uart_8250_port *up, int offset, int value) { up->port.serial_out(&up->port, offset, value); } /** * serial_lsr_in - Read LSR register and preserve flags across reads * @up: uart 8250 port * * Read LSR register and handle saving non-preserved flags across reads. * The flags that are not preserved across reads are stored into * up->lsr_saved_flags. * * Returns LSR value or'ed with the preserved flags (if any). */ static inline u16 serial_lsr_in(struct uart_8250_port *up) { u16 lsr = up->lsr_saved_flags; lsr |= serial_in(up, UART_LSR); up->lsr_saved_flags = lsr & up->lsr_save_mask; return lsr; } /* * For the 16C950 */ static void serial_icr_write(struct uart_8250_port *up, int offset, int value) { serial_out(up, UART_SCR, offset); serial_out(up, UART_ICR, value); } static unsigned int __maybe_unused serial_icr_read(struct uart_8250_port *up, int offset) { unsigned int value; serial_icr_write(up, UART_ACR, up->acr | UART_ACR_ICRRD); serial_out(up, UART_SCR, offset); value = serial_in(up, UART_ICR); serial_icr_write(up, UART_ACR, up->acr); return value; } void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p); static inline u32 serial_dl_read(struct uart_8250_port *up) { return up->dl_read(up); } static inline void serial_dl_write(struct uart_8250_port *up, u32 value) { up->dl_write(up, value); } static inline bool serial8250_set_THRI(struct uart_8250_port *up) { /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&up->port.lock); if (up->ier & UART_IER_THRI) return false; up->ier |= UART_IER_THRI; serial_out(up, UART_IER, up->ier); return true; } static inline bool serial8250_clear_THRI(struct uart_8250_port *up) { /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&up->port.lock); if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; serial_out(up, UART_IER, up->ier); return true; } struct uart_8250_port *serial8250_get_port(int line); void serial8250_rpm_get(struct uart_8250_port *p); void serial8250_rpm_put(struct uart_8250_port *p); void serial8250_rpm_get_tx(struct uart_8250_port *p); void serial8250_rpm_put_tx(struct uart_8250_port *p); int serial8250_em485_config(struct uart_port *port, struct ktermios *termios, struct serial_rs485 *rs485); void serial8250_em485_start_tx(struct uart_8250_port *p); void serial8250_em485_stop_tx(struct uart_8250_port *p); void serial8250_em485_destroy(struct uart_8250_port *p); extern struct serial_rs485 serial8250_em485_supported; /* MCR <-> TIOCM conversion */ static inline int serial8250_TIOCM_to_MCR(int tiocm) { int mcr = 0; if (tiocm & TIOCM_RTS) mcr |= UART_MCR_RTS; if (tiocm & TIOCM_DTR) mcr |= UART_MCR_DTR; if (tiocm & TIOCM_OUT1) mcr |= UART_MCR_OUT1; if (tiocm & TIOCM_OUT2) mcr |= UART_MCR_OUT2; if (tiocm & TIOCM_LOOP) mcr |= UART_MCR_LOOP; return mcr; } static inline int serial8250_MCR_to_TIOCM(int mcr) { int tiocm = 0; if (mcr & UART_MCR_RTS) tiocm |= TIOCM_RTS; if (mcr & UART_MCR_DTR) tiocm |= TIOCM_DTR; if (mcr & UART_MCR_OUT1) tiocm |= TIOCM_OUT1; if (mcr & UART_MCR_OUT2) tiocm |= TIOCM_OUT2; if (mcr & UART_MCR_LOOP) tiocm |= TIOCM_LOOP; return tiocm; } /* MSR <-> TIOCM conversion */ static inline int serial8250_MSR_to_TIOCM(int msr) { int tiocm = 0; if (msr & UART_MSR_DCD) tiocm |= TIOCM_CAR; if (msr & UART_MSR_RI) tiocm |= TIOCM_RNG; if (msr & UART_MSR_DSR) tiocm |= TIOCM_DSR; if (msr & UART_MSR_CTS) tiocm |= TIOCM_CTS; return tiocm; } static inline void serial8250_out_MCR(struct uart_8250_port *up, int value) { serial_out(up, UART_MCR, value); if (up->gpios) mctrl_gpio_set(up->gpios, serial8250_MCR_to_TIOCM(value)); } static inline int serial8250_in_MCR(struct uart_8250_port *up) { int mctrl; mctrl = serial_in(up, UART_MCR); if (up->gpios) { unsigned int mctrl_gpio = 0; mctrl_gpio = mctrl_gpio_get_outputs(up->gpios, &mctrl_gpio); mctrl |= serial8250_TIOCM_to_MCR(mctrl_gpio); } return mctrl; } bool alpha_jensen(void); void alpha_jensen_set_mctrl(struct uart_port *port, unsigned int mctrl); #ifdef CONFIG_SERIAL_8250_PNP int serial8250_pnp_init(void); void serial8250_pnp_exit(void); #else static inline int serial8250_pnp_init(void) { return 0; } static inline void serial8250_pnp_exit(void) { } #endif #ifdef CONFIG_SERIAL_8250_FINTEK int fintek_8250_probe(struct uart_8250_port *uart); #else static inline int fintek_8250_probe(struct uart_8250_port *uart) { return 0; } #endif #ifdef CONFIG_ARCH_OMAP1 #include <linux/soc/ti/omap1-soc.h> static inline int is_omap1_8250(struct uart_8250_port *pt) { int res; switch (pt->port.mapbase) { case OMAP1_UART1_BASE: case OMAP1_UART2_BASE: case OMAP1_UART3_BASE: res = 1; break; default: res = 0; break; } return res; } static inline int is_omap1510_8250(struct uart_8250_port *pt) { if (!cpu_is_omap1510()) return 0; return is_omap1_8250(pt); } #else static inline int is_omap1_8250(struct uart_8250_port *pt) { return 0; } static inline int is_omap1510_8250(struct uart_8250_port *pt) { return 0; } #endif #ifdef CONFIG_SERIAL_8250_DMA extern int serial8250_tx_dma(struct uart_8250_port *); extern int serial8250_rx_dma(struct uart_8250_port *); extern void serial8250_rx_dma_flush(struct uart_8250_port *); extern int serial8250_request_dma(struct uart_8250_port *); extern void serial8250_release_dma(struct uart_8250_port *); static inline void serial8250_do_prepare_tx_dma(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; if (dma->prepare_tx_dma) dma->prepare_tx_dma(p); } static inline void serial8250_do_prepare_rx_dma(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; if (dma->prepare_rx_dma) dma->prepare_rx_dma(p); } static inline bool serial8250_tx_dma_running(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; return dma && dma->tx_running; } #else static inline int serial8250_tx_dma(struct uart_8250_port *p) { return -1; } static inline int serial8250_rx_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_rx_dma_flush(struct uart_8250_port *p) { } static inline int serial8250_request_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_release_dma(struct uart_8250_port *p) { } static inline bool serial8250_tx_dma_running(struct uart_8250_port *p) { return false; } #endif static inline int ns16550a_goto_highspeed(struct uart_8250_port *up) { unsigned char status; status = serial_in(up, 0x04); /* EXCR2 */ #define PRESL(x) ((x) & 0x30) if (PRESL(status) == 0x10) { /* already in high speed mode */ return 0; } else { status &= ~0xB0; /* Disable LOCK, mask out PRESL[01] */ status |= 0x10; /* 1.625 divisor for baud_base --> 921600 */ serial_out(up, 0x04, status); } return 1; } static inline int serial_index(struct uart_port *port) { return port->minor - 64; } |
411 49 385 328 1 5 5 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_FIB_RULES_H #define __NET_FIB_RULES_H #include <linux/types.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/fib_rules.h> #include <linux/refcount.h> #include <net/flow.h> #include <net/rtnetlink.h> #include <net/fib_notifier.h> #include <linux/indirect_call_wrapper.h> struct fib_kuid_range { kuid_t start; kuid_t end; }; struct fib_rule { struct list_head list; int iifindex; int oifindex; u32 mark; u32 mark_mask; u32 flags; u32 table; u8 action; u8 l3mdev; u8 proto; u8 ip_proto; u32 target; __be64 tun_id; struct fib_rule __rcu *ctarget; struct net *fr_net; refcount_t refcnt; u32 pref; int suppress_ifgroup; int suppress_prefixlen; char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; struct fib_kuid_range uid_range; struct fib_rule_port_range sport_range; struct fib_rule_port_range dport_range; struct rcu_head rcu; }; struct fib_lookup_arg { void *lookup_ptr; const void *lookup_data; void *result; struct fib_rule *rule; u32 table; int flags; #define FIB_LOOKUP_NOREF 1 #define FIB_LOOKUP_IGNORE_LINKSTATE 2 }; struct fib_rules_ops { int family; struct list_head list; int rule_size; int addr_size; int unresolved_rules; int nr_goto_rules; unsigned int fib_rules_seq; int (*action)(struct fib_rule *, struct flowi *, int, struct fib_lookup_arg *); bool (*suppress)(struct fib_rule *, int, struct fib_lookup_arg *); int (*match)(struct fib_rule *, struct flowi *, int); int (*configure)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *, struct nlattr **, struct netlink_ext_ack *); int (*delete)(struct fib_rule *); int (*compare)(struct fib_rule *, struct fib_rule_hdr *, struct nlattr **); int (*fill)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *); size_t (*nlmsg_payload)(struct fib_rule *); /* Called after modifications to the rules set, must flush * the route cache if one exists. */ void (*flush_cache)(struct fib_rules_ops *ops); int nlgroup; struct list_head rules_list; struct module *owner; struct net *fro_net; struct rcu_head rcu; }; struct fib_rule_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib_rule *rule; }; static inline void fib_rule_get(struct fib_rule *rule) { refcount_inc(&rule->refcnt); } static inline void fib_rule_put(struct fib_rule *rule) { if (refcount_dec_and_test(&rule->refcnt)) kfree_rcu(rule, rcu); } #ifdef CONFIG_NET_L3_MASTER_DEV static inline u32 fib_rule_get_table(struct fib_rule *rule, struct fib_lookup_arg *arg) { return rule->l3mdev ? arg->table : rule->table; } #else static inline u32 fib_rule_get_table(struct fib_rule *rule, struct fib_lookup_arg *arg) { return rule->table; } #endif static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla) { if (nla[FRA_TABLE]) return nla_get_u32(nla[FRA_TABLE]); return frh->table; } static inline bool fib_rule_port_range_set(const struct fib_rule_port_range *range) { return range->start != 0 && range->end != 0; } static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a, __be16 port) { return ntohs(port) >= a->start && ntohs(port) <= a->end; } static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a) { return a->start != 0 && a->end != 0 && a->end < 0xffff && a->start <= a->end; } static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a, struct fib_rule_port_range *b) { return a->start == b->start && a->end == b->end; } static inline bool fib_rule_requires_fldissect(struct fib_rule *rule) { return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto || fib_rule_port_range_set(&rule->sport_range) || fib_rule_port_range_set(&rule->dport_range)); } struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *, struct net *); void fib_rules_unregister(struct fib_rules_ops *); int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, u32 flags); bool fib_rule_matchall(const struct fib_rule *rule); int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); unsigned int fib_rules_seq_read(struct net *net, int family); int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); INDIRECT_CALLABLE_DECLARE(int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); INDIRECT_CALLABLE_DECLARE(int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); INDIRECT_CALLABLE_DECLARE(int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib6_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib4_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg)); #endif |
343 343 343 29 204 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/cache.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/pid_namespace.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/completion.h> #include <linux/poll.h> #include <linux/printk.h> #include <linux/file.h> #include <linux/limits.h> #include <linux/init.h> #include <linux/module.h> #include <linux/sysctl.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/mount.h> #include <linux/bug.h> #include "internal.h" static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; struct ctl_table_header *head; struct proc_inode *ei = PROC_I(inode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Stop tracking associated processes */ if (ei->pid) { proc_pid_evict_inode(ei); ei->pid = NULL; } /* Let go of any associated proc directory entry */ de = ei->pde; if (de) { pde_put(de); ei->pde = NULL; } head = ei->sysctl; if (head) { RCU_INIT_POINTER(ei->sysctl, NULL); proc_sys_evict_inode(inode, head); } } static struct kmem_cache *proc_inode_cachep __ro_after_init; static struct kmem_cache *pde_opener_cache __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { struct proc_inode *ei; ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; INIT_HLIST_NODE(&ei->sibling_inodes); ei->ns_ops = NULL; return &ei->vfs_inode; } static void proc_free_inode(struct inode *inode) { kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; inode_init_once(&ei->vfs_inode); } void __init proc_init_kmemcache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT| SLAB_PANIC), init_once); pde_opener_cache = kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, SLAB_ACCOUNT|SLAB_PANIC, NULL); proc_dir_entry_cache = kmem_cache_create_usercopy( "proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC, offsetof(struct proc_dir_entry, inline_name), SIZEOF_PDE_INLINE_NAME, NULL); BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); } void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) { struct inode *inode; struct proc_inode *ei; struct hlist_node *node; struct super_block *old_sb = NULL; rcu_read_lock(); for (;;) { struct super_block *sb; node = hlist_first_rcu(inodes); if (!node) break; ei = hlist_entry(node, struct proc_inode, sibling_inodes); spin_lock(lock); hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(lock); inode = &ei->vfs_inode; sb = inode->i_sb; if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active)) continue; inode = igrab(inode); rcu_read_unlock(); if (sb != old_sb) { if (old_sb) deactivate_super(old_sb); old_sb = sb; } if (unlikely(!inode)) { rcu_read_lock(); continue; } if (S_ISDIR(inode->i_mode)) { struct dentry *dir = d_find_any_alias(inode); if (dir) { d_invalidate(dir); dput(dir); } } else { struct dentry *dentry; while ((dentry = d_find_alias(inode))) { d_invalidate(dentry); dput(dentry); } } iput(inode); rcu_read_lock(); } rcu_read_unlock(); if (old_sb) deactivate_super(old_sb); } static inline const char *hidepid2str(enum proc_hidepid v) { switch (v) { case HIDEPID_OFF: return "off"; case HIDEPID_NO_ACCESS: return "noaccess"; case HIDEPID_INVISIBLE: return "invisible"; case HIDEPID_NOT_PTRACEABLE: return "ptraceable"; } WARN_ONCE(1, "bad hide_pid value: %d\n", v); return "unknown"; } static int proc_show_options(struct seq_file *seq, struct dentry *root) { struct proc_fs_info *fs_info = proc_sb_info(root->d_sb); if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid)); if (fs_info->hide_pid != HIDEPID_OFF) seq_printf(seq, ",hidepid=%s", hidepid2str(fs_info->hide_pid)); if (fs_info->pidonly != PROC_PIDONLY_OFF) seq_printf(seq, ",subset=pid"); return 0; } const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .free_inode = proc_free_inode, .drop_inode = generic_delete_inode, .evict_inode = proc_evict_inode, .statfs = simple_statfs, .show_options = proc_show_options, }; enum {BIAS = -1U<<31}; static inline int use_pde(struct proc_dir_entry *pde) { return likely(atomic_inc_unless_negative(&pde->in_use)); } static void unuse_pde(struct proc_dir_entry *pde) { if (unlikely(atomic_dec_return(&pde->in_use) == BIAS)) complete(pde->pde_unload_completion); } /* * At most 2 contexts can enter this function: the one doing the last * close on the descriptor and whoever is deleting PDE itself. * * First to enter calls ->proc_release hook and signals its completion * to the second one which waits and then does nothing. * * PDE is locked on entry, unlocked on exit. */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) __releases(&pde->pde_unload_lock) { /* * close() (proc_reg_release()) can't delete an entry and proceed: * ->release hook needs to be available at the right moment. * * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: * "struct file" needs to be available at the right moment. */ if (pdeo->closing) { /* somebody else is doing that, just wait */ DECLARE_COMPLETION_ONSTACK(c); pdeo->c = &c; spin_unlock(&pde->pde_unload_lock); wait_for_completion(&c); } else { struct file *file; struct completion *c; pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; pde->proc_ops->proc_release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); /* Strictly after ->proc_release, see above. */ list_del(&pdeo->lh); c = pdeo->c; spin_unlock(&pde->pde_unload_lock); if (unlikely(c)) complete(c); kmem_cache_free(pde_opener_cache, pdeo); } } void proc_entry_rundown(struct proc_dir_entry *de) { DECLARE_COMPLETION_ONSTACK(c); /* Wait until all existing callers into module are done. */ de->pde_unload_completion = &c; if (atomic_add_return(BIAS, &de->in_use) != BIAS) wait_for_completion(&c); /* ->pde_openers list can't grow from now on. */ spin_lock(&de->pde_unload_lock); while (!list_empty(&de->pde_openers)) { struct pde_opener *pdeo; pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); close_pdeo(de, pdeo); spin_lock(&de->pde_unload_lock); } spin_unlock(&de->pde_unload_lock); } static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; if (pde_is_permanent(pde)) { return pde->proc_ops->proc_lseek(file, offset, whence); } else if (use_pde(pde)) { rv = pde->proc_ops->proc_lseek(file, offset, whence); unuse_pde(pde); } return rv; } static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp)); ssize_t ret; if (pde_is_permanent(pde)) return pde->proc_ops->proc_read_iter(iocb, iter); if (!use_pde(pde)) return -EIO; ret = pde->proc_ops->proc_read_iter(iocb, iter); unuse_pde(pde); return ret; } static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) { typeof_member(struct proc_ops, proc_read) read; read = pde->proc_ops->proc_read; if (read) return read(file, buf, count, ppos); return -EIO; } static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (pde_is_permanent(pde)) { return pde_read(pde, file, buf, count, ppos); } else if (use_pde(pde)) { rv = pde_read(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) { typeof_member(struct proc_ops, proc_write) write; write = pde->proc_ops->proc_write; if (write) return write(file, buf, count, ppos); return -EIO; } static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (pde_is_permanent(pde)) { return pde_write(pde, file, buf, count, ppos); } else if (use_pde(pde)) { rv = pde_write(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) { typeof_member(struct proc_ops, proc_poll) poll; poll = pde->proc_ops->proc_poll; if (poll) return poll(file, pts); return DEFAULT_POLLMASK; } static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; if (pde_is_permanent(pde)) { return pde_poll(pde, file, pts); } else if (use_pde(pde)) { rv = pde_poll(pde, file, pts); unuse_pde(pde); } return rv; } static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { typeof_member(struct proc_ops, proc_ioctl) ioctl; ioctl = pde->proc_ops->proc_ioctl; if (ioctl) return ioctl(file, cmd, arg); return -ENOTTY; } static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (pde_is_permanent(pde)) { return pde_ioctl(pde, file, cmd, arg); } else if (use_pde(pde)) { rv = pde_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #ifdef CONFIG_COMPAT static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; compat_ioctl = pde->proc_ops->proc_compat_ioctl; if (compat_ioctl) return compat_ioctl(file, cmd, arg); return -ENOTTY; } static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (pde_is_permanent(pde)) { return pde_compat_ioctl(pde, file, cmd, arg); } else if (use_pde(pde)) { rv = pde_compat_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #endif static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) { typeof_member(struct proc_ops, proc_mmap) mmap; mmap = pde->proc_ops->proc_mmap; if (mmap) return mmap(file, vma); return -EIO; } static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; if (pde_is_permanent(pde)) { return pde_mmap(pde, file, vma); } else if (use_pde(pde)) { rv = pde_mmap(pde, file, vma); unuse_pde(pde); } return rv; } static unsigned long pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; get_area = pde->proc_ops->proc_get_unmapped_area; #ifdef CONFIG_MMU if (!get_area) get_area = current->mm->get_unmapped_area; #endif if (get_area) return get_area(file, orig_addr, len, pgoff, flags); return orig_addr; } static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned long rv = -EIO; if (pde_is_permanent(pde)) { return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); } else if (use_pde(pde)) { rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); unuse_pde(pde); } return rv; } static int proc_reg_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; typeof_member(struct proc_ops, proc_open) open; typeof_member(struct proc_ops, proc_release) release; struct pde_opener *pdeo; if (!pde->proc_ops->proc_lseek) file->f_mode &= ~FMODE_LSEEK; if (pde_is_permanent(pde)) { open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); return rv; } /* * Ensure that * 1) PDE's ->release hook will be called no matter what * either normally by close()/->release, or forcefully by * rmmod/remove_proc_entry. * * 2) rmmod isn't blocked by opening file in /proc and sitting on * the descriptor (including "rmmod foo </proc/foo" scenario). * * Save every "struct file" with custom ->release hook. */ if (!use_pde(pde)) return -ENOENT; release = pde->proc_ops->proc_release; if (release) { pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); if (!pdeo) { rv = -ENOMEM; goto out_unuse; } } open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); if (release) { if (rv == 0) { /* To know what to release. */ pdeo->file = file; pdeo->closing = false; pdeo->c = NULL; spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); spin_unlock(&pde->pde_unload_lock); } else kmem_cache_free(pde_opener_cache, pdeo); } out_unuse: unuse_pde(pde); return rv; } static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); struct pde_opener *pdeo; if (pde_is_permanent(pde)) { typeof_member(struct proc_ops, proc_release) release; release = pde->proc_ops->proc_release; if (release) { return release(inode, file); } return 0; } spin_lock(&pde->pde_unload_lock); list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { close_pdeo(pde, pdeo); return 0; } } spin_unlock(&pde->pde_unload_lock); return 0; } static const struct file_operations proc_reg_file_ops = { .llseek = proc_reg_llseek, .read = proc_reg_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; static const struct file_operations proc_iter_file_ops = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .write = proc_reg_write, .splice_read = copy_splice_read, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; #ifdef CONFIG_COMPAT static const struct file_operations proc_reg_file_ops_compat = { .llseek = proc_reg_llseek, .read = proc_reg_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .compat_ioctl = proc_reg_compat_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; static const struct file_operations proc_iter_file_ops_compat = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .splice_read = copy_splice_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .compat_ioctl = proc_reg_compat_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; #endif static void proc_put_link(void *p) { unuse_pde(p); } static const char *proc_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct proc_dir_entry *pde = PDE(inode); if (!use_pde(pde)) return ERR_PTR(-EINVAL); set_delayed_call(done, proc_put_link, pde); return pde->data; } const struct inode_operations proc_link_inode_operations = { .get_link = proc_get_link, }; struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { struct inode *inode = new_inode(sb); if (!inode) { pde_put(de); return NULL; } inode->i_private = de->data; inode->i_ino = de->low_ino; inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); PROC_I(inode)->pde = de; if (is_empty_pde(de)) { make_empty_dir_inode(inode); return inode; } if (de->mode) { inode->i_mode = de->mode; inode->i_uid = de->uid; inode->i_gid = de->gid; } if (de->size) inode->i_size = de->size; if (de->nlink) set_nlink(inode, de->nlink); if (S_ISREG(inode->i_mode)) { inode->i_op = de->proc_iops; if (de->proc_ops->proc_read_iter) inode->i_fop = &proc_iter_file_ops; else inode->i_fop = &proc_reg_file_ops; #ifdef CONFIG_COMPAT if (de->proc_ops->proc_compat_ioctl) { if (de->proc_ops->proc_read_iter) inode->i_fop = &proc_iter_file_ops_compat; else inode->i_fop = &proc_reg_file_ops_compat; } #endif } else if (S_ISDIR(inode->i_mode)) { inode->i_op = de->proc_iops; inode->i_fop = de->proc_dir_ops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = de->proc_iops; inode->i_fop = NULL; } else { BUG(); } return inode; } |
101 101 100 101 1 1 101 101 104 440 516 523 523 523 523 523 199 199 520 523 523 436 436 523 520 516 6 6 5 101 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2003, 2004 * * This file is part of the SCTP kernel implementation * * This file contains the code relating the chunk abstraction. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* This file is mostly in anticipation of future work, but initially * populate with fragment tracking for an outbound message. */ /* Initialize datamsg from memory. */ static void sctp_datamsg_init(struct sctp_datamsg *msg) { refcount_set(&msg->refcnt, 1); msg->send_failed = 0; msg->send_error = 0; msg->can_delay = 1; msg->abandoned = 0; msg->expires_at = 0; INIT_LIST_HEAD(&msg->chunks); } /* Allocate and initialize datamsg. */ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) { struct sctp_datamsg *msg; msg = kmalloc(sizeof(struct sctp_datamsg), gfp); if (msg) { sctp_datamsg_init(msg); SCTP_DBG_OBJCNT_INC(datamsg); } return msg; } void sctp_datamsg_free(struct sctp_datamsg *msg) { struct sctp_chunk *chunk; /* This doesn't have to be a _safe vairant because * sctp_chunk_free() only drops the refs. */ list_for_each_entry(chunk, &msg->chunks, frag_list) sctp_chunk_free(chunk); sctp_datamsg_put(msg); } /* Final destructruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { struct sctp_association *asoc = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_ulpevent *ev; int error, sent; /* Release all references. */ list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); if (!msg->send_failed) { sctp_chunk_put(chunk); continue; } asoc = chunk->asoc; error = msg->send_error ?: asoc->outqueue.error; sent = chunk->has_tsn ? SCTP_DATA_SENT : SCTP_DATA_UNSENT; if (sctp_ulpevent_type_enabled(asoc->subscribe, SCTP_SEND_FAILED)) { ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } if (sctp_ulpevent_type_enabled(asoc->subscribe, SCTP_SEND_FAILED_EVENT)) { ev = sctp_ulpevent_make_send_failed_event(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_chunk_put(chunk); } SCTP_DBG_OBJCNT_DEC(datamsg); kfree(msg); } /* Hold a reference. */ static void sctp_datamsg_hold(struct sctp_datamsg *msg) { refcount_inc(&msg->refcnt); } /* Release a reference. */ void sctp_datamsg_put(struct sctp_datamsg *msg) { if (refcount_dec_and_test(&msg->refcnt)) sctp_datamsg_destroy(msg); } /* Assign a chunk to this datamsg. */ static void sctp_datamsg_assign(struct sctp_datamsg *msg, struct sctp_chunk *chunk) { sctp_datamsg_hold(msg); chunk->msg = msg; } /* A data chunk can have a maximum payload of (2^16 - 20). Break * down any such message into smaller chunks. Opportunistically, fragment * the chunks down to the current MTU constraints. We may get refragmented * later if the PMTU changes, but it is _much better_ to fragment immediately * with a reasonable guess than always doing our fragmentation on the * soft-interrupt. */ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct iov_iter *from) { size_t len, first_len, max_data, remaining; size_t msg_len = iov_iter_count(from); struct sctp_shared_key *shkey = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_datamsg *msg; int err; msg = sctp_datamsg_new(GFP_KERNEL); if (!msg) return ERR_PTR(-ENOMEM); /* Note: Calculate this outside of the loop, so that all fragments * have the same expiration. */ if (asoc->peer.prsctp_capable && sinfo->sinfo_timetolive && (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) || !SCTP_PR_POLICY(sinfo->sinfo_flags))) msg->expires_at = jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); /* This is the biggest possible DATA chunk that can fit into * the packet */ max_data = asoc->frag_point; if (unlikely(!max_data)) { max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk), sctp_datachk_len(&asoc->stream)); pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%zu)", __func__, asoc, max_data); } /* If the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with * DATA. */ if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) { struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + hmac_desc->hmac_len); if (sinfo->sinfo_tsn && sinfo->sinfo_ssn != asoc->active_key_id) { shkey = sctp_auth_get_shkey(asoc, sinfo->sinfo_ssn); if (!shkey) { err = -EINVAL; goto errout; } } else { shkey = asoc->shkey; } } /* Set first_len and then account for possible bundles on first frag */ first_len = max_data; /* Check to see if we have a pending SACK and try to let it be bundled * with this message. Do this if we don't have any data queued already. * To check that, look at out_qlen and retransmit list. * NOTE: we will not reduce to account for SACK, if the message would * not have been fragmented. */ if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) && asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && msg_len > max_data) first_len -= SCTP_PAD4(sizeof(struct sctp_sack_chunk)); /* Encourage Cookie-ECHO bundling. */ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN; /* Account for a different sized first fragment */ if (msg_len >= first_len) { msg->can_delay = 0; if (msg_len > first_len) SCTP_INC_STATS(asoc->base.net, SCTP_MIB_FRAGUSRMSGS); } else { /* Which may be the only one... */ first_len = msg_len; } /* Create chunks for all DATA chunks. */ for (remaining = msg_len; remaining; remaining -= len) { u8 frag = SCTP_DATA_MIDDLE_FRAG; if (remaining == msg_len) { /* First frag, which may also be the last */ frag |= SCTP_DATA_FIRST_FRAG; len = first_len; } else { /* Middle frags */ len = max_data; } if (len >= remaining) { /* Last frag, which may also be the first */ len = remaining; frag |= SCTP_DATA_LAST_FRAG; /* The application requests to set the I-bit of the * last DATA chunk of a user message when providing * the user message to the SCTP implementation. */ if ((sinfo->sinfo_flags & SCTP_EOF) || (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY)) frag |= SCTP_DATA_SACK_IMM; } chunk = asoc->stream.si->make_datafrag(asoc, sinfo, len, frag, GFP_KERNEL); if (!chunk) { err = -ENOMEM; goto errout; } err = sctp_user_addto_chunk(chunk, len, from); if (err < 0) goto errout_chunk_free; chunk->shkey = shkey; /* Put the chunk->skb back into the form expected by send. */ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - chunk->skb->data); sctp_datamsg_assign(msg, chunk); list_add_tail(&chunk->frag_list, &msg->chunks); } return msg; errout_chunk_free: sctp_chunk_free(chunk); errout: list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); sctp_chunk_free(chunk); } sctp_datamsg_put(msg); return ERR_PTR(err); } /* Check whether this message has expired. */ int sctp_chunk_abandoned(struct sctp_chunk *chunk) { if (!chunk->asoc->peer.prsctp_capable) return 0; if (chunk->msg->abandoned) return 1; if (!chunk->has_tsn && !(chunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG)) return 0; if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = SCTP_SO(&chunk->asoc->stream, chunk->sinfo.sinfo_stream); if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(TTL)]++; } else { chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } chunk->msg->abandoned = 1; return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { struct sctp_stream_out *streamout = SCTP_SO(&chunk->asoc->stream, chunk->sinfo.sinfo_stream); chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; chunk->msg->abandoned = 1; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && time_after(jiffies, chunk->msg->expires_at)) { chunk->msg->abandoned = 1; return 1; } /* PRIO policy is processed by sendmsg, not here */ return 0; } /* This chunk (and consequently entire message) has failed in its sending. */ void sctp_chunk_fail(struct sctp_chunk *chunk, int error) { chunk->msg->send_failed = 1; chunk->msg->send_error = error; } |
3427 3429 3426 3428 3429 3429 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | // SPDX-License-Identifier: GPL-2.0-or-later /* * The "hash function" used as the core of the ChaCha stream cipher (RFC7539) * * Copyright (C) 2015 Martin Willi */ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/bitops.h> #include <linux/string.h> #include <asm/unaligned.h> #include <crypto/chacha.h> static void chacha_permute(u32 *x, int nrounds) { int i; /* whitelist the allowed round counts */ WARN_ON_ONCE(nrounds != 20 && nrounds != 12); for (i = 0; i < nrounds; i += 2) { x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16); x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16); x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16); x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16); x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12); x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12); x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12); x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12); x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8); x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8); x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8); x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8); x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7); x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7); x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7); x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7); x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16); x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16); x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16); x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16); x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12); x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12); x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12); x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12); x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8); x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8); x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8); x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8); x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7); x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7); x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7); x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7); } } /** * chacha_block_generic - generate one keystream block and increment block counter * @state: input state matrix (16 32-bit words) * @stream: output keystream block (64 bytes) * @nrounds: number of rounds (20 or 12; 20 is recommended) * * This is the ChaCha core, a function from 64-byte strings to 64-byte strings. * The caller has already converted the endianness of the input. This function * also handles incrementing the block counter in the input matrix. */ void chacha_block_generic(u32 *state, u8 *stream, int nrounds) { u32 x[16]; int i; memcpy(x, state, 64); chacha_permute(x, nrounds); for (i = 0; i < ARRAY_SIZE(x); i++) put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]); state[12]++; } EXPORT_SYMBOL(chacha_block_generic); /** * hchacha_block_generic - abbreviated ChaCha core, for XChaCha * @state: input state matrix (16 32-bit words) * @stream: output (8 32-bit words) * @nrounds: number of rounds (20 or 12; 20 is recommended) * * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha * skips the final addition of the initial state, and outputs only certain words * of the state. It should not be used for streaming directly. */ void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds) { u32 x[16]; memcpy(x, state, 64); chacha_permute(x, nrounds); memcpy(&stream[0], &x[0], 16); memcpy(&stream[4], &x[12], 16); } EXPORT_SYMBOL(hchacha_block_generic); |
97 89 89 97 89 89 89 3 89 20 20 20 20 20 7 7 7 7 7 7 7 7 7 3 3 3 24 20 24 24 24 24 24 24 24 24 24 24 10 20 20 20 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 | // SPDX-License-Identifier: GPL-2.0-or-later /* Generic associative array implementation. * * See Documentation/core-api/assoc_array.rst for information. * * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ //#define DEBUG #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/assoc_array_priv.h> /* * Iterate over an associative array. The caller must hold the RCU read lock * or better. */ static int assoc_array_subtree_iterate(const struct assoc_array_ptr *root, const struct assoc_array_ptr *stop, int (*iterator)(const void *leaf, void *iterator_data), void *iterator_data) { const struct assoc_array_shortcut *shortcut; const struct assoc_array_node *node; const struct assoc_array_ptr *cursor, *ptr, *parent; unsigned long has_meta; int slot, ret; cursor = root; begin_node: if (assoc_array_ptr_is_shortcut(cursor)) { /* Descend through a shortcut */ shortcut = assoc_array_ptr_to_shortcut(cursor); cursor = READ_ONCE(shortcut->next_node); /* Address dependency. */ } node = assoc_array_ptr_to_node(cursor); slot = 0; /* We perform two passes of each node. * * The first pass does all the leaves in this node. This means we * don't miss any leaves if the node is split up by insertion whilst * we're iterating over the branches rooted here (we may, however, see * some leaves twice). */ has_meta = 0; for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ has_meta |= (unsigned long)ptr; if (ptr && assoc_array_ptr_is_leaf(ptr)) { /* We need a barrier between the read of the pointer, * which is supplied by the above READ_ONCE(). */ /* Invoke the callback */ ret = iterator(assoc_array_ptr_to_leaf(ptr), iterator_data); if (ret) return ret; } } /* The second pass attends to all the metadata pointers. If we follow * one of these we may find that we don't come back here, but rather go * back to a replacement node with the leaves in a different layout. * * We are guaranteed to make progress, however, as the slot number for * a particular portion of the key space cannot change - and we * continue at the back pointer + 1. */ if (!(has_meta & ASSOC_ARRAY_PTR_META_TYPE)) goto finished_node; slot = 0; continue_node: node = assoc_array_ptr_to_node(cursor); for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ if (assoc_array_ptr_is_meta(ptr)) { cursor = ptr; goto begin_node; } } finished_node: /* Move up to the parent (may need to skip back over a shortcut) */ parent = READ_ONCE(node->back_pointer); /* Address dependency. */ slot = node->parent_slot; if (parent == stop) return 0; if (assoc_array_ptr_is_shortcut(parent)) { shortcut = assoc_array_ptr_to_shortcut(parent); cursor = parent; parent = READ_ONCE(shortcut->back_pointer); /* Address dependency. */ slot = shortcut->parent_slot; if (parent == stop) return 0; } /* Ascend to next slot in parent node */ cursor = parent; slot++; goto continue_node; } /** * assoc_array_iterate - Pass all objects in the array to a callback * @array: The array to iterate over. * @iterator: The callback function. * @iterator_data: Private data for the callback function. * * Iterate over all the objects in an associative array. Each one will be * presented to the iterator function. * * If the array is being modified concurrently with the iteration then it is * possible that some objects in the array will be passed to the iterator * callback more than once - though every object should be passed at least * once. If this is undesirable then the caller must lock against modification * for the duration of this function. * * The function will return 0 if no objects were in the array or else it will * return the result of the last iterator function called. Iteration stops * immediately if any call to the iteration function results in a non-zero * return. * * The caller should hold the RCU read lock or better if concurrent * modification is possible. */ int assoc_array_iterate(const struct assoc_array *array, int (*iterator)(const void *object, void *iterator_data), void *iterator_data) { struct assoc_array_ptr *root = READ_ONCE(array->root); /* Address dependency. */ if (!root) return 0; return assoc_array_subtree_iterate(root, NULL, iterator, iterator_data); } enum assoc_array_walk_status { assoc_array_walk_tree_empty, assoc_array_walk_found_terminal_node, assoc_array_walk_found_wrong_shortcut, }; struct assoc_array_walk_result { struct { struct assoc_array_node *node; /* Node in which leaf might be found */ int level; int slot; } terminal_node; struct { struct assoc_array_shortcut *shortcut; int level; int sc_level; unsigned long sc_segments; unsigned long dissimilarity; } wrong_shortcut; }; /* * Navigate through the internal tree looking for the closest node to the key. */ static enum assoc_array_walk_status assoc_array_walk(const struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key, struct assoc_array_walk_result *result) { struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *cursor, *ptr; unsigned long sc_segments, dissimilarity; unsigned long segments; int level, sc_level, next_sc_level; int slot; pr_devel("-->%s()\n", __func__); cursor = READ_ONCE(array->root); /* Address dependency. */ if (!cursor) return assoc_array_walk_tree_empty; level = 0; /* Use segments from the key for the new leaf to navigate through the * internal tree, skipping through nodes and shortcuts that are on * route to the destination. Eventually we'll come to a slot that is * either empty or contains a leaf at which point we've found a node in * which the leaf we're looking for might be found or into which it * should be inserted. */ jumped: segments = ops->get_key_chunk(index_key, level); pr_devel("segments[%d]: %lx\n", level, segments); if (assoc_array_ptr_is_shortcut(cursor)) goto follow_shortcut; consider_node: node = assoc_array_ptr_to_node(cursor); slot = segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK); slot &= ASSOC_ARRAY_FAN_MASK; ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ pr_devel("consider slot %x [ix=%d type=%lu]\n", slot, level, (unsigned long)ptr & 3); if (!assoc_array_ptr_is_meta(ptr)) { /* The node doesn't have a node/shortcut pointer in the slot * corresponding to the index key that we have to follow. */ result->terminal_node.node = node; result->terminal_node.level = level; result->terminal_node.slot = slot; pr_devel("<--%s() = terminal_node\n", __func__); return assoc_array_walk_found_terminal_node; } if (assoc_array_ptr_is_node(ptr)) { /* There is a pointer to a node in the slot corresponding to * this index key segment, so we need to follow it. */ cursor = ptr; level += ASSOC_ARRAY_LEVEL_STEP; if ((level & ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) goto consider_node; goto jumped; } /* There is a shortcut in the slot corresponding to the index key * segment. We follow the shortcut if its partial index key matches * this leaf's. Otherwise we need to split the shortcut. */ cursor = ptr; follow_shortcut: shortcut = assoc_array_ptr_to_shortcut(cursor); pr_devel("shortcut to %d\n", shortcut->skip_to_level); sc_level = level + ASSOC_ARRAY_LEVEL_STEP; BUG_ON(sc_level > shortcut->skip_to_level); do { /* Check the leaf against the shortcut's index key a word at a * time, trimming the final word (the shortcut stores the index * key completely from the root to the shortcut's target). */ if ((sc_level & ASSOC_ARRAY_KEY_CHUNK_MASK) == 0) segments = ops->get_key_chunk(index_key, sc_level); sc_segments = shortcut->index_key[sc_level >> ASSOC_ARRAY_KEY_CHUNK_SHIFT]; dissimilarity = segments ^ sc_segments; if (round_up(sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE) > shortcut->skip_to_level) { /* Trim segments that are beyond the shortcut */ int shift = shortcut->skip_to_level & ASSOC_ARRAY_KEY_CHUNK_MASK; dissimilarity &= ~(ULONG_MAX << shift); next_sc_level = shortcut->skip_to_level; } else { next_sc_level = sc_level + ASSOC_ARRAY_KEY_CHUNK_SIZE; next_sc_level = round_down(next_sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); } if (dissimilarity != 0) { /* This shortcut points elsewhere */ result->wrong_shortcut.shortcut = shortcut; result->wrong_shortcut.level = level; result->wrong_shortcut.sc_level = sc_level; result->wrong_shortcut.sc_segments = sc_segments; result->wrong_shortcut.dissimilarity = dissimilarity; return assoc_array_walk_found_wrong_shortcut; } sc_level = next_sc_level; } while (sc_level < shortcut->skip_to_level); /* The shortcut matches the leaf's index to this point. */ cursor = READ_ONCE(shortcut->next_node); /* Address dependency. */ if (((level ^ sc_level) & ~ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) { level = sc_level; goto jumped; } else { level = sc_level; goto consider_node; } } /** * assoc_array_find - Find an object by index key * @array: The associative array to search. * @ops: The operations to use. * @index_key: The key to the object. * * Find an object in an associative array by walking through the internal tree * to the node that should contain the object and then searching the leaves * there. NULL is returned if the requested object was not found in the array. * * The caller must hold the RCU read lock or better. */ void *assoc_array_find(const struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key) { struct assoc_array_walk_result result; const struct assoc_array_node *node; const struct assoc_array_ptr *ptr; const void *leaf; int slot; if (assoc_array_walk(array, ops, index_key, &result) != assoc_array_walk_found_terminal_node) return NULL; node = result.terminal_node.node; /* If the target key is available to us, it's has to be pointed to by * the terminal node. */ for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ if (ptr && assoc_array_ptr_is_leaf(ptr)) { /* We need a barrier between the read of the pointer * and dereferencing the pointer - but only if we are * actually going to dereference it. */ leaf = assoc_array_ptr_to_leaf(ptr); if (ops->compare_object(leaf, index_key)) return (void *)leaf; } } return NULL; } /* * Destructively iterate over an associative array. The caller must prevent * other simultaneous accesses. */ static void assoc_array_destroy_subtree(struct assoc_array_ptr *root, const struct assoc_array_ops *ops) { struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *cursor, *parent = NULL; int slot = -1; pr_devel("-->%s()\n", __func__); cursor = root; if (!cursor) { pr_devel("empty\n"); return; } move_to_meta: if (assoc_array_ptr_is_shortcut(cursor)) { /* Descend through a shortcut */ pr_devel("[%d] shortcut\n", slot); BUG_ON(!assoc_array_ptr_is_shortcut(cursor)); shortcut = assoc_array_ptr_to_shortcut(cursor); BUG_ON(shortcut->back_pointer != parent); BUG_ON(slot != -1 && shortcut->parent_slot != slot); parent = cursor; cursor = shortcut->next_node; slot = -1; BUG_ON(!assoc_array_ptr_is_node(cursor)); } pr_devel("[%d] node\n", slot); node = assoc_array_ptr_to_node(cursor); BUG_ON(node->back_pointer != parent); BUG_ON(slot != -1 && node->parent_slot != slot); slot = 0; continue_node: pr_devel("Node %p [back=%p]\n", node, node->back_pointer); for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { struct assoc_array_ptr *ptr = node->slots[slot]; if (!ptr) continue; if (assoc_array_ptr_is_meta(ptr)) { parent = cursor; cursor = ptr; goto move_to_meta; } if (ops) { pr_devel("[%d] free leaf\n", slot); ops->free_object(assoc_array_ptr_to_leaf(ptr)); } } parent = node->back_pointer; slot = node->parent_slot; pr_devel("free node\n"); kfree(node); if (!parent) return; /* Done */ /* Move back up to the parent (may need to free a shortcut on * the way up) */ if (assoc_array_ptr_is_shortcut(parent)) { shortcut = assoc_array_ptr_to_shortcut(parent); BUG_ON(shortcut->next_node != cursor); cursor = parent; parent = shortcut->back_pointer; slot = shortcut->parent_slot; pr_devel("free shortcut\n"); kfree(shortcut); if (!parent) return; BUG_ON(!assoc_array_ptr_is_node(parent)); } /* Ascend to next slot in parent node */ pr_devel("ascend to %p[%d]\n", parent, slot); cursor = parent; node = assoc_array_ptr_to_node(cursor); slot++; goto continue_node; } /** * assoc_array_destroy - Destroy an associative array * @array: The array to destroy. * @ops: The operations to use. * * Discard all metadata and free all objects in an associative array. The * array will be empty and ready to use again upon completion. This function * cannot fail. * * The caller must prevent all other accesses whilst this takes place as no * attempt is made to adjust pointers gracefully to permit RCU readlock-holding * accesses to continue. On the other hand, no memory allocation is required. */ void assoc_array_destroy(struct assoc_array *array, const struct assoc_array_ops *ops) { assoc_array_destroy_subtree(array->root, ops); array->root = NULL; } /* * Handle insertion into an empty tree. */ static bool assoc_array_insert_in_empty_tree(struct assoc_array_edit *edit) { struct assoc_array_node *new_n0; pr_devel("-->%s()\n", __func__); new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); if (!new_n0) return false; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); edit->leaf_p = &new_n0->slots[0]; edit->adjust_count_on = new_n0; edit->set[0].ptr = &edit->array->root; edit->set[0].to = assoc_array_node_to_ptr(new_n0); pr_devel("<--%s() = ok [no root]\n", __func__); return true; } /* * Handle insertion into a terminal node. */ static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit, const struct assoc_array_ops *ops, const void *index_key, struct assoc_array_walk_result *result) { struct assoc_array_shortcut *shortcut, *new_s0; struct assoc_array_node *node, *new_n0, *new_n1, *side; struct assoc_array_ptr *ptr; unsigned long dissimilarity, base_seg, blank; size_t keylen; bool have_meta; int level, diff; int slot, next_slot, free_slot, i, j; node = result->terminal_node.node; level = result->terminal_node.level; edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = result->terminal_node.slot; pr_devel("-->%s()\n", __func__); /* We arrived at a node which doesn't have an onward node or shortcut * pointer that we have to follow. This means that (a) the leaf we * want must go here (either by insertion or replacement) or (b) we * need to split this node and insert in one of the fragments. */ free_slot = -1; /* Firstly, we have to check the leaves in this node to see if there's * a matching one we should replace in place. */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; if (!ptr) { free_slot = i; continue; } if (assoc_array_ptr_is_leaf(ptr) && ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) { pr_devel("replace in slot %d\n", i); edit->leaf_p = &node->slots[i]; edit->dead_leaf = node->slots[i]; pr_devel("<--%s() = ok [replace]\n", __func__); return true; } } /* If there is a free slot in this node then we can just insert the * leaf here. */ if (free_slot >= 0) { pr_devel("insert in free slot %d\n", free_slot); edit->leaf_p = &node->slots[free_slot]; edit->adjust_count_on = node; pr_devel("<--%s() = ok [insert]\n", __func__); return true; } /* The node has no spare slots - so we're either going to have to split * it or insert another node before it. * * Whatever, we're going to need at least two new nodes - so allocate * those now. We may also need a new shortcut, but we deal with that * when we need it. */ new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); if (!new_n0) return false; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); new_n1 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); if (!new_n1) return false; edit->new_meta[1] = assoc_array_node_to_ptr(new_n1); /* We need to find out how similar the leaves are. */ pr_devel("no spare slots\n"); have_meta = false; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; if (assoc_array_ptr_is_meta(ptr)) { edit->segment_cache[i] = 0xff; have_meta = true; continue; } base_seg = ops->get_object_key_chunk( assoc_array_ptr_to_leaf(ptr), level); base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; } if (have_meta) { pr_devel("have meta\n"); goto split_node; } /* The node contains only leaves */ dissimilarity = 0; base_seg = edit->segment_cache[0]; for (i = 1; i < ASSOC_ARRAY_FAN_OUT; i++) dissimilarity |= edit->segment_cache[i] ^ base_seg; pr_devel("only leaves; dissimilarity=%lx\n", dissimilarity); if ((dissimilarity & ASSOC_ARRAY_FAN_MASK) == 0) { /* The old leaves all cluster in the same slot. We will need * to insert a shortcut if the new node wants to cluster with them. */ if ((edit->segment_cache[ASSOC_ARRAY_FAN_OUT] ^ base_seg) == 0) goto all_leaves_cluster_together; /* Otherwise all the old leaves cluster in the same slot, but * the new leaf wants to go into a different slot - so we * create a new node (n0) to hold the new leaf and a pointer to * a new node (n1) holding all the old leaves. * * This can be done by falling through to the node splitting * path. */ pr_devel("present leaves cluster but not new leaf\n"); } split_node: pr_devel("split node\n"); /* We need to split the current node. The node must contain anything * from a single leaf (in the one leaf case, this leaf will cluster * with the new leaf) and the rest meta-pointers, to all leaves, some * of which may cluster. * * It won't contain the case in which all the current leaves plus the * new leaves want to cluster in the same slot. * * We need to expel at least two leaves out of a set consisting of the * leaves in the node and the new leaf. The current meta pointers can * just be copied as they shouldn't cluster with any of the leaves. * * We need a new node (n0) to replace the current one and a new node to * take the expelled nodes (n1). */ edit->set[0].to = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = node->back_pointer; new_n0->parent_slot = node->parent_slot; new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); new_n1->parent_slot = -1; /* Need to calculate this */ do_split_node: pr_devel("do_split_node\n"); new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; new_n1->nr_leaves_on_branch = 0; /* Begin by finding two matching leaves. There have to be at least two * that match - even if there are meta pointers - because any leaf that * would match a slot with a meta pointer in it must be somewhere * behind that meta pointer and cannot be here. Further, given N * remaining leaf slots, we now have N+1 leaves to go in them. */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { slot = edit->segment_cache[i]; if (slot != 0xff) for (j = i + 1; j < ASSOC_ARRAY_FAN_OUT + 1; j++) if (edit->segment_cache[j] == slot) goto found_slot_for_multiple_occupancy; } found_slot_for_multiple_occupancy: pr_devel("same slot: %x %x [%02x]\n", i, j, slot); BUG_ON(i >= ASSOC_ARRAY_FAN_OUT); BUG_ON(j >= ASSOC_ARRAY_FAN_OUT + 1); BUG_ON(slot >= ASSOC_ARRAY_FAN_OUT); new_n1->parent_slot = slot; /* Metadata pointers cannot change slot */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) if (assoc_array_ptr_is_meta(node->slots[i])) new_n0->slots[i] = node->slots[i]; else new_n0->slots[i] = NULL; BUG_ON(new_n0->slots[slot] != NULL); new_n0->slots[slot] = assoc_array_node_to_ptr(new_n1); /* Filter the leaf pointers between the new nodes */ free_slot = -1; next_slot = 0; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { if (assoc_array_ptr_is_meta(node->slots[i])) continue; if (edit->segment_cache[i] == slot) { new_n1->slots[next_slot++] = node->slots[i]; new_n1->nr_leaves_on_branch++; } else { do { free_slot++; } while (new_n0->slots[free_slot] != NULL); new_n0->slots[free_slot] = node->slots[i]; } } pr_devel("filtered: f=%x n=%x\n", free_slot, next_slot); if (edit->segment_cache[ASSOC_ARRAY_FAN_OUT] != slot) { do { free_slot++; } while (new_n0->slots[free_slot] != NULL); edit->leaf_p = &new_n0->slots[free_slot]; edit->adjust_count_on = new_n0; } else { edit->leaf_p = &new_n1->slots[next_slot++]; edit->adjust_count_on = new_n1; } BUG_ON(next_slot <= 1); edit->set_backpointers_to = assoc_array_node_to_ptr(new_n0); for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { if (edit->segment_cache[i] == 0xff) { ptr = node->slots[i]; BUG_ON(assoc_array_ptr_is_leaf(ptr)); if (assoc_array_ptr_is_node(ptr)) { side = assoc_array_ptr_to_node(ptr); edit->set_backpointers[i] = &side->back_pointer; } else { shortcut = assoc_array_ptr_to_shortcut(ptr); edit->set_backpointers[i] = &shortcut->back_pointer; } } } ptr = node->back_pointer; if (!ptr) edit->set[0].ptr = &edit->array->root; else if (assoc_array_ptr_is_node(ptr)) edit->set[0].ptr = &assoc_array_ptr_to_node(ptr)->slots[node->parent_slot]; else edit->set[0].ptr = &assoc_array_ptr_to_shortcut(ptr)->next_node; edit->excised_meta[0] = assoc_array_node_to_ptr(node); pr_devel("<--%s() = ok [split node]\n", __func__); return true; all_leaves_cluster_together: /* All the leaves, new and old, want to cluster together in this node * in the same slot, so we have to replace this node with a shortcut to * skip over the identical parts of the key and then place a pair of * nodes, one inside the other, at the end of the shortcut and * distribute the keys between them. * * Firstly we need to work out where the leaves start diverging as a * bit position into their keys so that we know how big the shortcut * needs to be. * * We only need to make a single pass of N of the N+1 leaves because if * any keys differ between themselves at bit X then at least one of * them must also differ with the base key at bit X or before. */ pr_devel("all leaves cluster together\n"); diff = INT_MAX; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { int x = ops->diff_objects(assoc_array_ptr_to_leaf(node->slots[i]), index_key); if (x < diff) { BUG_ON(x < 0); diff = x; } } BUG_ON(diff == INT_MAX); BUG_ON(diff < level + ASSOC_ARRAY_LEVEL_STEP); keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s0 = kzalloc(struct_size(new_s0, index_key, keylen), GFP_KERNEL); if (!new_s0) return false; edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s0); edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); new_s0->back_pointer = node->back_pointer; new_s0->parent_slot = node->parent_slot; new_s0->next_node = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); new_n0->parent_slot = 0; new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); new_n1->parent_slot = -1; /* Need to calculate this */ new_s0->skip_to_level = level = diff & ~ASSOC_ARRAY_LEVEL_STEP_MASK; pr_devel("skip_to_level = %d [diff %d]\n", level, diff); BUG_ON(level <= 0); for (i = 0; i < keylen; i++) new_s0->index_key[i] = ops->get_key_chunk(index_key, i * ASSOC_ARRAY_KEY_CHUNK_SIZE); if (level & ASSOC_ARRAY_KEY_CHUNK_MASK) { blank = ULONG_MAX << (level & ASSOC_ARRAY_KEY_CHUNK_MASK); pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, level, blank); new_s0->index_key[keylen - 1] &= ~blank; } /* This now reduces to a node splitting exercise for which we'll need * to regenerate the disparity table. */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; base_seg = ops->get_object_key_chunk(assoc_array_ptr_to_leaf(ptr), level); base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; } base_seg = ops->get_key_chunk(index_key, level); base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = base_seg & ASSOC_ARRAY_FAN_MASK; goto do_split_node; } /* * Handle insertion into the middle of a shortcut. */ static bool assoc_array_insert_mid_shortcut(struct assoc_array_edit *edit, const struct assoc_array_ops *ops, struct assoc_array_walk_result *result) { struct assoc_array_shortcut *shortcut, *new_s0, *new_s1; struct assoc_array_node *node, *new_n0, *side; unsigned long sc_segments, dissimilarity, blank; size_t keylen; int level, sc_level, diff; int sc_slot; shortcut = result->wrong_shortcut.shortcut; level = result->wrong_shortcut.level; sc_level = result->wrong_shortcut.sc_level; sc_segments = result->wrong_shortcut.sc_segments; dissimilarity = result->wrong_shortcut.dissimilarity; pr_devel("-->%s(ix=%d dis=%lx scix=%d)\n", __func__, level, dissimilarity, sc_level); /* We need to split a shortcut and insert a node between the two * pieces. Zero-length pieces will be dispensed with entirely. * * First of all, we need to find out in which level the first * difference was. */ diff = __ffs(dissimilarity); diff &= ~ASSOC_ARRAY_LEVEL_STEP_MASK; diff += sc_level & ~ASSOC_ARRAY_KEY_CHUNK_MASK; pr_devel("diff=%d\n", diff); if (!shortcut->back_pointer) { edit->set[0].ptr = &edit->array->root; } else if (assoc_array_ptr_is_node(shortcut->back_pointer)) { node = assoc_array_ptr_to_node(shortcut->back_pointer); edit->set[0].ptr = &node->slots[shortcut->parent_slot]; } else { BUG(); } edit->excised_meta[0] = assoc_array_shortcut_to_ptr(shortcut); /* Create a new node now since we're going to need it anyway */ new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); if (!new_n0) return false; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); edit->adjust_count_on = new_n0; /* Insert a new shortcut before the new node if this segment isn't of * zero length - otherwise we just connect the new node directly to the * parent. */ level += ASSOC_ARRAY_LEVEL_STEP; if (diff > level) { pr_devel("pre-shortcut %d...%d\n", level, diff); keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s0 = kzalloc(struct_size(new_s0, index_key, keylen), GFP_KERNEL); if (!new_s0) return false; edit->new_meta[1] = assoc_array_shortcut_to_ptr(new_s0); edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); new_s0->back_pointer = shortcut->back_pointer; new_s0->parent_slot = shortcut->parent_slot; new_s0->next_node = assoc_array_node_to_ptr(new_n0); new_s0->skip_to_level = diff; new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); new_n0->parent_slot = 0; memcpy(new_s0->index_key, shortcut->index_key, flex_array_size(new_s0, index_key, keylen)); blank = ULONG_MAX << (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, diff, blank); new_s0->index_key[keylen - 1] &= ~blank; } else { pr_devel("no pre-shortcut\n"); edit->set[0].to = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = shortcut->back_pointer; new_n0->parent_slot = shortcut->parent_slot; } side = assoc_array_ptr_to_node(shortcut->next_node); new_n0->nr_leaves_on_branch = side->nr_leaves_on_branch; /* We need to know which slot in the new node is going to take a * metadata pointer. */ sc_slot = sc_segments >> (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); sc_slot &= ASSOC_ARRAY_FAN_MASK; pr_devel("new slot %lx >> %d -> %d\n", sc_segments, diff & ASSOC_ARRAY_KEY_CHUNK_MASK, sc_slot); /* Determine whether we need to follow the new node with a replacement * for the current shortcut. We could in theory reuse the current * shortcut if its parent slot number doesn't change - but that's a * 1-in-16 chance so not worth expending the code upon. */ level = diff + ASSOC_ARRAY_LEVEL_STEP; if (level < shortcut->skip_to_level) { pr_devel("post-shortcut %d...%d\n", level, shortcut->skip_to_level); keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s1 = kzalloc(struct_size(new_s1, index_key, keylen), GFP_KERNEL); if (!new_s1) return false; edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s1); new_s1->back_pointer = assoc_array_node_to_ptr(new_n0); new_s1->parent_slot = sc_slot; new_s1->next_node = shortcut->next_node; new_s1->skip_to_level = shortcut->skip_to_level; new_n0->slots[sc_slot] = assoc_array_shortcut_to_ptr(new_s1); memcpy(new_s1->index_key, shortcut->index_key, flex_array_size(new_s1, index_key, keylen)); edit->set[1].ptr = &side->back_pointer; edit->set[1].to = assoc_array_shortcut_to_ptr(new_s1); } else { pr_devel("no post-shortcut\n"); /* We don't have to replace the pointed-to node as long as we * use memory barriers to make sure the parent slot number is * changed before the back pointer (the parent slot number is * irrelevant to the old parent shortcut). */ new_n0->slots[sc_slot] = shortcut->next_node; edit->set_parent_slot[0].p = &side->parent_slot; edit->set_parent_slot[0].to = sc_slot; edit->set[1].ptr = &side->back_pointer; edit->set[1].to = assoc_array_node_to_ptr(new_n0); } /* Install the new leaf in a spare slot in the new node. */ if (sc_slot == 0) edit->leaf_p = &new_n0->slots[1]; else edit->leaf_p = &new_n0->slots[0]; pr_devel("<--%s() = ok [split shortcut]\n", __func__); return edit; } /** * assoc_array_insert - Script insertion of an object into an associative array * @array: The array to insert into. * @ops: The operations to use. * @index_key: The key to insert at. * @object: The object to insert. * * Precalculate and preallocate a script for the insertion or replacement of an * object in an associative array. This results in an edit script that can * either be applied or cancelled. * * The function returns a pointer to an edit script or -ENOMEM. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ struct assoc_array_edit *assoc_array_insert(struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key, void *object) { struct assoc_array_walk_result result; struct assoc_array_edit *edit; pr_devel("-->%s()\n", __func__); /* The leaf pointer we're given must not have the bottom bit set as we * use those for type-marking the pointer. NULL pointers are also not * allowed as they indicate an empty slot but we have to allow them * here as they can be updated later. */ BUG_ON(assoc_array_ptr_is_meta(object)); edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); if (!edit) return ERR_PTR(-ENOMEM); edit->array = array; edit->ops = ops; edit->leaf = assoc_array_leaf_to_ptr(object); edit->adjust_count_by = 1; switch (assoc_array_walk(array, ops, index_key, &result)) { case assoc_array_walk_tree_empty: /* Allocate a root node if there isn't one yet */ if (!assoc_array_insert_in_empty_tree(edit)) goto enomem; return edit; case assoc_array_walk_found_terminal_node: /* We found a node that doesn't have a node/shortcut pointer in * the slot corresponding to the index key that we have to * follow. */ if (!assoc_array_insert_into_terminal_node(edit, ops, index_key, &result)) goto enomem; return edit; case assoc_array_walk_found_wrong_shortcut: /* We found a shortcut that didn't match our key in a slot we * needed to follow. */ if (!assoc_array_insert_mid_shortcut(edit, ops, &result)) goto enomem; return edit; } enomem: /* Clean up after an out of memory error */ pr_devel("enomem\n"); assoc_array_cancel_edit(edit); return ERR_PTR(-ENOMEM); } /** * assoc_array_insert_set_object - Set the new object pointer in an edit script * @edit: The edit script to modify. * @object: The object pointer to set. * * Change the object to be inserted in an edit script. The object pointed to * by the old object is not freed. This must be done prior to applying the * script. */ void assoc_array_insert_set_object(struct assoc_array_edit *edit, void *object) { BUG_ON(!object); edit->leaf = assoc_array_leaf_to_ptr(object); } struct assoc_array_delete_collapse_context { struct assoc_array_node *node; const void *skip_leaf; int slot; }; /* * Subtree collapse to node iterator. */ static int assoc_array_delete_collapse_iterator(const void *leaf, void *iterator_data) { struct assoc_array_delete_collapse_context *collapse = iterator_data; if (leaf == collapse->skip_leaf) return 0; BUG_ON(collapse->slot >= ASSOC_ARRAY_FAN_OUT); collapse->node->slots[collapse->slot++] = assoc_array_leaf_to_ptr(leaf); return 0; } /** * assoc_array_delete - Script deletion of an object from an associative array * @array: The array to search. * @ops: The operations to use. * @index_key: The key to the object. * * Precalculate and preallocate a script for the deletion of an object from an * associative array. This results in an edit script that can either be * applied or cancelled. * * The function returns a pointer to an edit script if the object was found, * NULL if the object was not found or -ENOMEM. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ struct assoc_array_edit *assoc_array_delete(struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key) { struct assoc_array_delete_collapse_context collapse; struct assoc_array_walk_result result; struct assoc_array_node *node, *new_n0; struct assoc_array_edit *edit; struct assoc_array_ptr *ptr; bool has_meta; int slot, i; pr_devel("-->%s()\n", __func__); edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); if (!edit) return ERR_PTR(-ENOMEM); edit->array = array; edit->ops = ops; edit->adjust_count_by = -1; switch (assoc_array_walk(array, ops, index_key, &result)) { case assoc_array_walk_found_terminal_node: /* We found a node that should contain the leaf we've been * asked to remove - *if* it's in the tree. */ pr_devel("terminal_node\n"); node = result.terminal_node.node; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = node->slots[slot]; if (ptr && assoc_array_ptr_is_leaf(ptr) && ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) goto found_leaf; } fallthrough; case assoc_array_walk_tree_empty: case assoc_array_walk_found_wrong_shortcut: default: assoc_array_cancel_edit(edit); pr_devel("not found\n"); return NULL; } found_leaf: BUG_ON(array->nr_leaves_on_tree <= 0); /* In the simplest form of deletion we just clear the slot and release * the leaf after a suitable interval. */ edit->dead_leaf = node->slots[slot]; edit->set[0].ptr = &node->slots[slot]; edit->set[0].to = NULL; edit->adjust_count_on = node; /* If that concludes erasure of the last leaf, then delete the entire * internal array. */ if (array->nr_leaves_on_tree == 1) { edit->set[1].ptr = &array->root; edit->set[1].to = NULL; edit->adjust_count_on = NULL; edit->excised_subtree = array->root; pr_devel("all gone\n"); return edit; } /* However, we'd also like to clear up some metadata blocks if we * possibly can. * * We go for a simple algorithm of: if this node has FAN_OUT or fewer * leaves in it, then attempt to collapse it - and attempt to * recursively collapse up the tree. * * We could also try and collapse in partially filled subtrees to take * up space in this node. */ if (node->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { struct assoc_array_node *parent, *grandparent; struct assoc_array_ptr *ptr; /* First of all, we need to know if this node has metadata so * that we don't try collapsing if all the leaves are already * here. */ has_meta = false; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; if (assoc_array_ptr_is_meta(ptr)) { has_meta = true; break; } } pr_devel("leaves: %ld [m=%d]\n", node->nr_leaves_on_branch - 1, has_meta); /* Look further up the tree to see if we can collapse this node * into a more proximal node too. */ parent = node; collapse_up: pr_devel("collapse subtree: %ld\n", parent->nr_leaves_on_branch); ptr = parent->back_pointer; if (!ptr) goto do_collapse; if (assoc_array_ptr_is_shortcut(ptr)) { struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(ptr); ptr = s->back_pointer; if (!ptr) goto do_collapse; } grandparent = assoc_array_ptr_to_node(ptr); if (grandparent->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { parent = grandparent; goto collapse_up; } do_collapse: /* There's no point collapsing if the original node has no meta * pointers to discard and if we didn't merge into one of that * node's ancestry. */ if (has_meta || parent != node) { node = parent; /* Create a new node to collapse into */ new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); if (!new_n0) goto enomem; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = node->back_pointer; new_n0->parent_slot = node->parent_slot; new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; edit->adjust_count_on = new_n0; collapse.node = new_n0; collapse.skip_leaf = assoc_array_ptr_to_leaf(edit->dead_leaf); collapse.slot = 0; assoc_array_subtree_iterate(assoc_array_node_to_ptr(node), node->back_pointer, assoc_array_delete_collapse_iterator, &collapse); pr_devel("collapsed %d,%lu\n", collapse.slot, new_n0->nr_leaves_on_branch); BUG_ON(collapse.slot != new_n0->nr_leaves_on_branch - 1); if (!node->back_pointer) { edit->set[1].ptr = &array->root; } else if (assoc_array_ptr_is_leaf(node->back_pointer)) { BUG(); } else if (assoc_array_ptr_is_node(node->back_pointer)) { struct assoc_array_node *p = assoc_array_ptr_to_node(node->back_pointer); edit->set[1].ptr = &p->slots[node->parent_slot]; } else if (assoc_array_ptr_is_shortcut(node->back_pointer)) { struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(node->back_pointer); edit->set[1].ptr = &s->next_node; } edit->set[1].to = assoc_array_node_to_ptr(new_n0); edit->excised_subtree = assoc_array_node_to_ptr(node); } } return edit; enomem: /* Clean up after an out of memory error */ pr_devel("enomem\n"); assoc_array_cancel_edit(edit); return ERR_PTR(-ENOMEM); } /** * assoc_array_clear - Script deletion of all objects from an associative array * @array: The array to clear. * @ops: The operations to use. * * Precalculate and preallocate a script for the deletion of all the objects * from an associative array. This results in an edit script that can either * be applied or cancelled. * * The function returns a pointer to an edit script if there are objects to be * deleted, NULL if there are no objects in the array or -ENOMEM. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ struct assoc_array_edit *assoc_array_clear(struct assoc_array *array, const struct assoc_array_ops *ops) { struct assoc_array_edit *edit; pr_devel("-->%s()\n", __func__); if (!array->root) return NULL; edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); if (!edit) return ERR_PTR(-ENOMEM); edit->array = array; edit->ops = ops; edit->set[1].ptr = &array->root; edit->set[1].to = NULL; edit->excised_subtree = array->root; edit->ops_for_excised_subtree = ops; pr_devel("all gone\n"); return edit; } /* * Handle the deferred destruction after an applied edit. */ static void assoc_array_rcu_cleanup(struct rcu_head *head) { struct assoc_array_edit *edit = container_of(head, struct assoc_array_edit, rcu); int i; pr_devel("-->%s()\n", __func__); if (edit->dead_leaf) edit->ops->free_object(assoc_array_ptr_to_leaf(edit->dead_leaf)); for (i = 0; i < ARRAY_SIZE(edit->excised_meta); i++) if (edit->excised_meta[i]) kfree(assoc_array_ptr_to_node(edit->excised_meta[i])); if (edit->excised_subtree) { BUG_ON(assoc_array_ptr_is_leaf(edit->excised_subtree)); if (assoc_array_ptr_is_node(edit->excised_subtree)) { struct assoc_array_node *n = assoc_array_ptr_to_node(edit->excised_subtree); n->back_pointer = NULL; } else { struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(edit->excised_subtree); s->back_pointer = NULL; } assoc_array_destroy_subtree(edit->excised_subtree, edit->ops_for_excised_subtree); } kfree(edit); } /** * assoc_array_apply_edit - Apply an edit script to an associative array * @edit: The script to apply. * * Apply an edit script to an associative array to effect an insertion, * deletion or clearance. As the edit script includes preallocated memory, * this is guaranteed not to fail. * * The edit script, dead objects and dead metadata will be scheduled for * destruction after an RCU grace period to permit those doing read-only * accesses on the array to continue to do so under the RCU read lock whilst * the edit is taking place. */ void assoc_array_apply_edit(struct assoc_array_edit *edit) { struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *ptr; int i; pr_devel("-->%s()\n", __func__); smp_wmb(); if (edit->leaf_p) *edit->leaf_p = edit->leaf; smp_wmb(); for (i = 0; i < ARRAY_SIZE(edit->set_parent_slot); i++) if (edit->set_parent_slot[i].p) *edit->set_parent_slot[i].p = edit->set_parent_slot[i].to; smp_wmb(); for (i = 0; i < ARRAY_SIZE(edit->set_backpointers); i++) if (edit->set_backpointers[i]) *edit->set_backpointers[i] = edit->set_backpointers_to; smp_wmb(); for (i = 0; i < ARRAY_SIZE(edit->set); i++) if (edit->set[i].ptr) *edit->set[i].ptr = edit->set[i].to; if (edit->array->root == NULL) { edit->array->nr_leaves_on_tree = 0; } else if (edit->adjust_count_on) { node = edit->adjust_count_on; for (;;) { node->nr_leaves_on_branch += edit->adjust_count_by; ptr = node->back_pointer; if (!ptr) break; if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = shortcut->back_pointer; if (!ptr) break; } BUG_ON(!assoc_array_ptr_is_node(ptr)); node = assoc_array_ptr_to_node(ptr); } edit->array->nr_leaves_on_tree += edit->adjust_count_by; } call_rcu(&edit->rcu, assoc_array_rcu_cleanup); } /** * assoc_array_cancel_edit - Discard an edit script. * @edit: The script to discard. * * Free an edit script and all the preallocated data it holds without making * any changes to the associative array it was intended for. * * NOTE! In the case of an insertion script, this does _not_ release the leaf * that was to be inserted. That is left to the caller. */ void assoc_array_cancel_edit(struct assoc_array_edit *edit) { struct assoc_array_ptr *ptr; int i; pr_devel("-->%s()\n", __func__); /* Clean up after an out of memory error */ for (i = 0; i < ARRAY_SIZE(edit->new_meta); i++) { ptr = edit->new_meta[i]; if (ptr) { if (assoc_array_ptr_is_node(ptr)) kfree(assoc_array_ptr_to_node(ptr)); else kfree(assoc_array_ptr_to_shortcut(ptr)); } } kfree(edit); } /** * assoc_array_gc - Garbage collect an associative array. * @array: The array to clean. * @ops: The operations to use. * @iterator: A callback function to pass judgement on each object. * @iterator_data: Private data for the callback function. * * Collect garbage from an associative array and pack down the internal tree to * save memory. * * The iterator function is asked to pass judgement upon each object in the * array. If it returns false, the object is discard and if it returns true, * the object is kept. If it returns true, it must increment the object's * usage count (or whatever it needs to do to retain it) before returning. * * This function returns 0 if successful or -ENOMEM if out of memory. In the * latter case, the array is not changed. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ int assoc_array_gc(struct assoc_array *array, const struct assoc_array_ops *ops, bool (*iterator)(void *object, void *iterator_data), void *iterator_data) { struct assoc_array_shortcut *shortcut, *new_s; struct assoc_array_node *node, *new_n; struct assoc_array_edit *edit; struct assoc_array_ptr *cursor, *ptr; struct assoc_array_ptr *new_root, *new_parent, **new_ptr_pp; unsigned long nr_leaves_on_tree; bool retained; int keylen, slot, nr_free, next_slot, i; pr_devel("-->%s()\n", __func__); if (!array->root) return 0; edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); if (!edit) return -ENOMEM; edit->array = array; edit->ops = ops; edit->ops_for_excised_subtree = ops; edit->set[0].ptr = &array->root; edit->excised_subtree = array->root; new_root = new_parent = NULL; new_ptr_pp = &new_root; cursor = array->root; descend: /* If this point is a shortcut, then we need to duplicate it and * advance the target cursor. */ if (assoc_array_ptr_is_shortcut(cursor)) { shortcut = assoc_array_ptr_to_shortcut(cursor); keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s = kmalloc(struct_size(new_s, index_key, keylen), GFP_KERNEL); if (!new_s) goto enomem; pr_devel("dup shortcut %p -> %p\n", shortcut, new_s); memcpy(new_s, shortcut, struct_size(new_s, index_key, keylen)); new_s->back_pointer = new_parent; new_s->parent_slot = shortcut->parent_slot; *new_ptr_pp = new_parent = assoc_array_shortcut_to_ptr(new_s); new_ptr_pp = &new_s->next_node; cursor = shortcut->next_node; } /* Duplicate the node at this position */ node = assoc_array_ptr_to_node(cursor); new_n = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); if (!new_n) goto enomem; pr_devel("dup node %p -> %p\n", node, new_n); new_n->back_pointer = new_parent; new_n->parent_slot = node->parent_slot; *new_ptr_pp = new_parent = assoc_array_node_to_ptr(new_n); new_ptr_pp = NULL; slot = 0; continue_node: /* Filter across any leaves and gc any subtrees */ for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = node->slots[slot]; if (!ptr) continue; if (assoc_array_ptr_is_leaf(ptr)) { if (iterator(assoc_array_ptr_to_leaf(ptr), iterator_data)) /* The iterator will have done any reference * counting on the object for us. */ new_n->slots[slot] = ptr; continue; } new_ptr_pp = &new_n->slots[slot]; cursor = ptr; goto descend; } retry_compress: pr_devel("-- compress node %p --\n", new_n); /* Count up the number of empty slots in this node and work out the * subtree leaf count. */ new_n->nr_leaves_on_branch = 0; nr_free = 0; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = new_n->slots[slot]; if (!ptr) nr_free++; else if (assoc_array_ptr_is_leaf(ptr)) new_n->nr_leaves_on_branch++; } pr_devel("free=%d, leaves=%lu\n", nr_free, new_n->nr_leaves_on_branch); /* See what we can fold in */ retained = false; next_slot = 0; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { struct assoc_array_shortcut *s; struct assoc_array_node *child; ptr = new_n->slots[slot]; if (!ptr || assoc_array_ptr_is_leaf(ptr)) continue; s = NULL; if (assoc_array_ptr_is_shortcut(ptr)) { s = assoc_array_ptr_to_shortcut(ptr); ptr = s->next_node; } child = assoc_array_ptr_to_node(ptr); new_n->nr_leaves_on_branch += child->nr_leaves_on_branch; if (child->nr_leaves_on_branch <= nr_free + 1) { /* Fold the child node into this one */ pr_devel("[%d] fold node %lu/%d [nx %d]\n", slot, child->nr_leaves_on_branch, nr_free + 1, next_slot); /* We would already have reaped an intervening shortcut * on the way back up the tree. */ BUG_ON(s); new_n->slots[slot] = NULL; nr_free++; if (slot < next_slot) next_slot = slot; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { struct assoc_array_ptr *p = child->slots[i]; if (!p) continue; BUG_ON(assoc_array_ptr_is_meta(p)); while (new_n->slots[next_slot]) next_slot++; BUG_ON(next_slot >= ASSOC_ARRAY_FAN_OUT); new_n->slots[next_slot++] = p; nr_free--; } kfree(child); } else { pr_devel("[%d] retain node %lu/%d [nx %d]\n", slot, child->nr_leaves_on_branch, nr_free + 1, next_slot); retained = true; } } if (retained && new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) { pr_devel("internal nodes remain despite enough space, retrying\n"); goto retry_compress; } pr_devel("after: %lu\n", new_n->nr_leaves_on_branch); nr_leaves_on_tree = new_n->nr_leaves_on_branch; /* Excise this node if it is singly occupied by a shortcut */ if (nr_free == ASSOC_ARRAY_FAN_OUT - 1) { for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) if ((ptr = new_n->slots[slot])) break; if (assoc_array_ptr_is_meta(ptr) && assoc_array_ptr_is_shortcut(ptr)) { pr_devel("excise node %p with 1 shortcut\n", new_n); new_s = assoc_array_ptr_to_shortcut(ptr); new_parent = new_n->back_pointer; slot = new_n->parent_slot; kfree(new_n); if (!new_parent) { new_s->back_pointer = NULL; new_s->parent_slot = 0; new_root = ptr; goto gc_complete; } if (assoc_array_ptr_is_shortcut(new_parent)) { /* We can discard any preceding shortcut also */ struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(new_parent); pr_devel("excise preceding shortcut\n"); new_parent = new_s->back_pointer = s->back_pointer; slot = new_s->parent_slot = s->parent_slot; kfree(s); if (!new_parent) { new_s->back_pointer = NULL; new_s->parent_slot = 0; new_root = ptr; goto gc_complete; } } new_s->back_pointer = new_parent; new_s->parent_slot = slot; new_n = assoc_array_ptr_to_node(new_parent); new_n->slots[slot] = ptr; goto ascend_old_tree; } } /* Excise any shortcuts we might encounter that point to nodes that * only contain leaves. */ ptr = new_n->back_pointer; if (!ptr) goto gc_complete; if (assoc_array_ptr_is_shortcut(ptr)) { new_s = assoc_array_ptr_to_shortcut(ptr); new_parent = new_s->back_pointer; slot = new_s->parent_slot; if (new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) { struct assoc_array_node *n; pr_devel("excise shortcut\n"); new_n->back_pointer = new_parent; new_n->parent_slot = slot; kfree(new_s); if (!new_parent) { new_root = assoc_array_node_to_ptr(new_n); goto gc_complete; } n = assoc_array_ptr_to_node(new_parent); n->slots[slot] = assoc_array_node_to_ptr(new_n); } } else { new_parent = ptr; } new_n = assoc_array_ptr_to_node(new_parent); ascend_old_tree: ptr = node->back_pointer; if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); slot = shortcut->parent_slot; cursor = shortcut->back_pointer; if (!cursor) goto gc_complete; } else { slot = node->parent_slot; cursor = ptr; } BUG_ON(!cursor); node = assoc_array_ptr_to_node(cursor); slot++; goto continue_node; gc_complete: edit->set[0].to = new_root; assoc_array_apply_edit(edit); array->nr_leaves_on_tree = nr_leaves_on_tree; return 0; enomem: pr_devel("enomem\n"); assoc_array_destroy_subtree(new_root, edit->ops); kfree(edit); return -ENOMEM; } |
131 131 134 134 134 134 134 134 25 25 209 115 149 63 63 63 149 149 149 149 149 149 115 115 149 57 115 115 115 115 115 9 222 60 60 231 231 231 231 231 231 231 231 231 231 231 149 141 149 115 149 15 122 122 120 231 231 231 231 231 231 231 60 60 60 259 259 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 | // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <crypto/algapi.h> #include <crypto/sha2.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> #include <net/protocol.h> #include <net/tcp.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/ip6_route.h> #include <net/transp_v6.h> #endif #include <net/mptcp.h> #include <uapi/linux/mptcp.h> #include "protocol.h" #include "mib.h" #include <trace/events/mptcp.h> #include <trace/events/sock.h> static void mptcp_subflow_ops_undo_override(struct sock *ssk); static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, enum linux_mptcp_mib_field field) { MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); } static void subflow_req_destructor(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); pr_debug("subflow_req=%p", subflow_req); if (subflow_req->msk) sock_put((struct sock *)subflow_req->msk); mptcp_token_destroy_request(req); } static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, void *hmac) { u8 msg[8]; put_unaligned_be32(nonce1, &msg[0]); put_unaligned_be32(nonce2, &msg[4]); mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); } static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk) { return mptcp_is_fully_established((void *)msk) && ((mptcp_pm_is_userspace(msk) && mptcp_userspace_pm_active(msk)) || READ_ONCE(msk->pm.accept_subflow)); } /* validate received token and create truncated hmac and nonce for SYN-ACK */ static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req) { struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); subflow_generate_hmac(msk->local_key, msk->remote_key, subflow_req->local_nonce, subflow_req->remote_nonce, hmac); subflow_req->thmac = get_unaligned_be64(hmac); } static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_sock *msk; int local_id; msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token); if (!msk) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); return NULL; } local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req); if (local_id < 0) { sock_put((struct sock *)msk); return NULL; } subflow_req->local_id = local_id; return msk; } static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); subflow_req->mp_capable = 0; subflow_req->mp_join = 0; subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener)); subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener)); subflow_req->msk = NULL; mptcp_token_init_request(req); } static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk) { return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; } static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) { struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (mpext) { memset(mpext, 0, sizeof(*mpext)); mpext->reset_reason = reason; } } /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset * should be sent. */ static int subflow_check_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; bool opt_mp_capable, opt_mp_join; pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) return -EINVAL; #endif mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); if (opt_mp_join) return 0; } else if (opt_mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); } if (opt_mp_capable && listener->request_mptcp) { int err, retries = MPTCP_TOKEN_MAX_RETRIES; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; again: do { get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key)); } while (subflow_req->local_key == 0); if (unlikely(req->syncookie)) { mptcp_crypto_key_sha(subflow_req->local_key, &subflow_req->token, &subflow_req->idsn); if (mptcp_token_exists(subflow_req->token)) { if (retries-- > 0) goto again; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else { subflow_req->mp_capable = 1; } return 0; } err = mptcp_token_new_request(req); if (err == 0) subflow_req->mp_capable = 1; else if (retries-- > 0) goto again; else SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else if (opt_mp_join && listener->request_mptcp) { subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; subflow_req->mp_join = 1; subflow_req->backup = mp_opt.backup; subflow_req->remote_id = mp_opt.join_id; subflow_req->token = mp_opt.token; subflow_req->remote_nonce = mp_opt.nonce; subflow_req->msk = subflow_token_join_request(req); /* Can't fall back to TCP in this case. */ if (!subflow_req->msk) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EPERM; } if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { pr_debug("syn inet_sport=%d %d", ntohs(inet_sk(sk_listener)->inet_sport), ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX); return -EPERM; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX); } subflow_req_create_thmac(subflow_req); if (unlikely(req->syncookie)) { if (mptcp_can_accept_new_subflow(subflow_req->msk)) subflow_init_req_cookie_join_save(subflow_req, skb); else return -EPERM; } pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } return 0; } int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; bool opt_mp_capable, opt_mp_join; int err; subflow_init_req(req, sk_listener); mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); if (opt_mp_capable && opt_mp_join) return -EINVAL; if (opt_mp_capable && listener->request_mptcp) { if (mp_opt.sndr_key == 0) return -EINVAL; subflow_req->local_key = mp_opt.rcvr_key; err = mptcp_token_new_request(req); if (err) return err; subflow_req->mp_capable = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; } else if (opt_mp_join && listener->request_mptcp) { if (!mptcp_token_join_cookie_init_state(subflow_req, skb)) return -EINVAL; subflow_req->mp_join = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; } return 0; } EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); static struct dst_entry *subflow_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req) { struct dst_entry *dst; int err; tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); if (!dst) return NULL; err = subflow_check_req(req, sk, skb); if (err == 0) return dst; dst_release(dst); if (!req->syncookie) tcp_request_sock_ops.send_reset(sk, skb); return NULL; } static void subflow_prep_synack(const struct sock *sk, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct inet_request_sock *ireq = inet_rsk(req); /* clear tstamp_ok, as needed depending on cookie */ if (foc && foc->len > -1) ireq->tstamp_ok = 0; if (synack_type == TCP_SYNACK_FASTOPEN) mptcp_fastopen_subflow_synack_set_params(subflow, req); } static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { subflow_prep_synack(sk, req, foc, synack_type); return tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc, synack_type, syn_skb); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { subflow_prep_synack(sk, req, foc, synack_type); return tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc, synack_type, syn_skb); } static struct dst_entry *subflow_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req) { struct dst_entry *dst; int err; tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); if (!dst) return NULL; err = subflow_check_req(req, sk, skb); if (err == 0) return dst; dst_release(dst); if (!req->syncookie) tcp6_request_sock_ops.send_reset(sk, skb); return NULL; } #endif /* validate received truncated hmac and create hmac for third ACK */ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) { u8 hmac[SHA256_DIGEST_SIZE]; u64 thmac; subflow_generate_hmac(subflow->remote_key, subflow->local_key, subflow->remote_nonce, subflow->local_nonce, hmac); thmac = get_unaligned_be64(hmac); pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n", subflow, subflow->token, thmac, subflow->thmac); return thmac == subflow->thmac; } void mptcp_subflow_reset(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; /* mptcp_mp_fail_no_response() can reach here on an already closed * socket */ if (ssk->sk_state == TCP_CLOSE) return; /* must hold: tcp_done() could drop last reference on parent */ sock_hold(sk); tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) mptcp_schedule_work(sk); sock_put(sk); } static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk) { return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; } void __mptcp_set_connected(struct sock *sk) { if (sk->sk_state == TCP_SYN_SENT) { inet_sk_state_store(sk, TCP_ESTABLISHED); sk->sk_state_change(sk); } } static void mptcp_set_connected(struct sock *sk) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_set_connected(sk); else __set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } static void subflow_set_remote_key(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt) { /* active MPC subflow will reach here multiple times: * at subflow_finish_connect() time and at 4th ack time */ if (subflow->remote_key_valid) return; subflow->remote_key_valid = 1; subflow->remote_key = mp_opt->sndr_key; mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn); subflow->iasn++; WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->ack_seq, subflow->iasn); WRITE_ONCE(msk->can_ack, true); atomic64_set(&msk->rcv_wnd_sent, subflow->iasn); } static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_options_received mp_opt; struct sock *parent = subflow->conn; struct mptcp_sock *msk; subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); /* be sure no special action on any packet other than syn-ack */ if (subflow->conn_finished) return; msk = mptcp_sk(parent); mptcp_propagate_sndbuf(parent, sk); subflow->rel_write_seq = 1; subflow->conn_finished = 1; subflow->ssn_offset = TCP_SKB_CB(skb)->seq; pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp) { if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); mptcp_do_fallback(sk); pr_fallback(msk); goto fallback; } if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); if (mp_opt.deny_join_id0) WRITE_ONCE(msk->pm.remote_deny_join_id0, true); subflow->mp_capable = 1; subflow_set_remote_key(msk, subflow, &mp_opt); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); mptcp_set_connected(parent); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ)) { subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } subflow->backup = mp_opt.backup; subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; subflow->remote_id = mp_opt.join_id; pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d", subflow, subflow->thmac, subflow->remote_nonce, subflow->backup); if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } if (!mptcp_finish_join(sk)) goto do_reset; subflow_generate_hmac(subflow->local_key, subflow->remote_key, subflow->local_nonce, subflow->remote_nonce, hmac); memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); if (subflow_use_different_dport(msk, sk)) { pr_debug("synack inet_dport=%d %d", ntohs(inet_sk(sk)->inet_dport), ntohs(inet_sk(parent)->inet_dport)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX); } } else if (mptcp_check_fallback(sk)) { fallback: mptcp_rcv_space_init(msk, sk); mptcp_set_connected(parent); } return; do_reset: subflow->reset_transient = 0; mptcp_subflow_reset(sk); } static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id) { subflow->local_id = local_id; subflow->local_id_valid = 1; } static int subflow_chk_local_id(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); int err; if (likely(subflow->local_id_valid)) return 0; err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); if (err < 0) return err; subflow_set_local_id(subflow, err); return 0; } static int subflow_rebuild_header(struct sock *sk) { int err = subflow_chk_local_id(sk); if (unlikely(err < 0)) return err; return inet_sk_rebuild_header(sk); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static int subflow_v6_rebuild_header(struct sock *sk) { int err = subflow_chk_local_id(sk); if (unlikely(err < 0)) return err; return inet6_sk_rebuild_header(sk); } #endif static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); pr_debug("subflow=%p", subflow); /* Never answer to SYNs sent to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops, &subflow_request_sock_ipv4_ops, sk, skb); drop: tcp_listendrop(sk); return 0; } static void subflow_v4_req_destructor(struct request_sock *req) { subflow_req_destructor(req); tcp_request_sock_ops.destructor(req); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; static struct proto tcpv6_prot_override __ro_after_init; static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); pr_debug("subflow=%p", subflow); if (skb->protocol == htons(ETH_P_IP)) return subflow_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); return 0; } return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops, &subflow_request_sock_ipv6_ops, sk, skb); drop: tcp_listendrop(sk); return 0; /* don't send reset */ } static void subflow_v6_req_destructor(struct request_sock *req) { subflow_req_destructor(req); tcp6_request_sock_ops.destructor(req); } #endif struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { if (ops->family == AF_INET) ops = &mptcp_subflow_v4_request_sock_ops; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ops->family == AF_INET6) ops = &mptcp_subflow_v6_request_sock_ops; #endif return inet_reqsk_alloc(ops, sk_listener, attach_listener); } EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); /* validate hmac received in third ACK */ static bool subflow_hmac_valid(const struct request_sock *req, const struct mptcp_options_received *mp_opt) { const struct mptcp_subflow_request_sock *subflow_req; u8 hmac[SHA256_DIGEST_SIZE]; struct mptcp_sock *msk; subflow_req = mptcp_subflow_rsk(req); msk = subflow_req->msk; if (!msk) return false; subflow_generate_hmac(msk->remote_key, msk->local_key, subflow_req->remote_nonce, subflow_req->local_nonce, hmac); return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN); } static void subflow_ulp_fallback(struct sock *sk, struct mptcp_subflow_context *old_ctx) { struct inet_connection_sock *icsk = inet_csk(sk); mptcp_subflow_tcp_fallback(sk, old_ctx); icsk->icsk_ulp_ops = NULL; rcu_assign_pointer(icsk->icsk_ulp_data, NULL); tcp_sk(sk)->is_mptcp = 0; mptcp_subflow_ops_undo_override(sk); } void mptcp_subflow_drop_ctx(struct sock *ssk) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); if (!ctx) return; list_del(&mptcp_subflow_ctx(ssk)->node); if (inet_csk(ssk)->icsk_ulp_ops) { subflow_ulp_fallback(ssk, ctx); if (ctx->conn) sock_put(ctx->conn); } kfree_rcu(ctx, rcu); } void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt) { struct mptcp_sock *msk = mptcp_sk(subflow->conn); subflow_set_remote_key(msk, subflow, mp_opt); subflow->fully_established = 1; WRITE_ONCE(msk->fully_established, true); if (subflow->is_mptfo) mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); } static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_request_sock *subflow_req; struct mptcp_options_received mp_opt; bool fallback, fallback_is_fatal; struct mptcp_sock *owner; struct sock *child; pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); /* After child creation we must look for MPC even when options * are not parsed */ mp_opt.suboptions = 0; /* hopefully temporary handling for MP_JOIN+syncookie */ subflow_req = mptcp_subflow_rsk(req); fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join; fallback = !tcp_rsk(req)->is_mptcp; if (fallback) goto create_child; /* if the sk is MP_CAPABLE, we try to fetch the client key */ if (subflow_req->mp_capable) { /* we can receive and accept an in-window, out-of-order pkt, * which may not carry the MP_CAPABLE opt even on mptcp enabled * paths: always try to extract the peer key, and fallback * for packets missing it. * Even OoO DSS packets coming legitly after dropped or * reordered MPC will cause fallback, but we don't have other * options. */ mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) fallback = true; } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) || !subflow_hmac_valid(req, &mp_opt) || !mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); fallback = true; } } create_child: child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); tcp_rsk(req)->drop_req = false; /* we need to fallback on ctx allocation failure and on pre-reqs * checking above. In the latter scenario we additionally need * to reset the context to non MPTCP status. */ if (!ctx || fallback) { if (fallback_is_fatal) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; } goto fallback; } /* ssk inherits options of listener sk */ ctx->setsockopt_seq = listener->setsockopt_seq; if (ctx->mp_capable) { ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req); if (!ctx->conn) goto fallback; ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); mptcp_pm_new_connection(owner, child, 1); /* with OoO packets we can reach here without ingress * mpc option */ if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) { mptcp_subflow_fully_established(ctx, &mp_opt); mptcp_pm_fully_established(owner, child); ctx->pm_notified = 1; } } else if (ctx->mp_join) { owner = subflow_req->msk; if (!owner) { subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; ctx->conn = (struct sock *)owner; if (subflow_use_different_sport(owner, sk)) { pr_debug("ack inet_sport=%d %d", ntohs(inet_sk(sk)->inet_sport), ntohs(inet_sk((struct sock *)owner)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(owner, sk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); goto dispose_child; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); } if (!mptcp_finish_join(child)) goto dispose_child; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); tcp_rsk(req)->drop_req = true; } } /* check for expected invariant - should never trigger, just help * catching eariler subtle bugs */ WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp && (!mptcp_subflow_ctx(child) || !mptcp_subflow_ctx(child)->conn)); return child; dispose_child: mptcp_subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; inet_csk_prepare_for_destroy_sock(child); tcp_done(child); req->rsk_ops->send_reset(sk, skb); /* The last child reference will be released by the caller */ return child; fallback: mptcp_subflow_drop_ctx(child); return child; } static struct inet_connection_sock_af_ops subflow_specific __ro_after_init; static struct proto tcp_prot_override __ro_after_init; enum mapping_status { MAPPING_OK, MAPPING_INVALID, MAPPING_EMPTY, MAPPING_DATA_FIN, MAPPING_DUMMY, MAPPING_BAD_CSUM }; static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d", ssn, subflow->map_subflow_seq, subflow->map_data_len); } static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned int skb_consumed; skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; if (WARN_ON_ONCE(skb_consumed >= skb->len)) return true; return skb->len - skb_consumed <= subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); } static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; if (unlikely(before(ssn, subflow->map_subflow_seq))) { /* Mapping covers data later in the subflow stream, * currently unsupported. */ dbg_bad_map(subflow, ssn); return false; } if (unlikely(!before(ssn, subflow->map_subflow_seq + subflow->map_data_len))) { /* Mapping does covers past subflow data, invalid */ dbg_bad_map(subflow, ssn); return false; } return true; } static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb, bool csum_reqd) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 offset, seq, delta; __sum16 csum; int len; if (!csum_reqd) return MAPPING_OK; /* mapping already validated on previous traversal */ if (subflow->map_csum_len == subflow->map_data_len) return MAPPING_OK; /* traverse the receive queue, ensuring it contains a full * DSS mapping and accumulating the related csum. * Preserve the accoumlate csum across multiple calls, to compute * the csum only once */ delta = subflow->map_data_len - subflow->map_csum_len; for (;;) { seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len; offset = seq - TCP_SKB_CB(skb)->seq; /* if the current skb has not been accounted yet, csum its contents * up to the amount covered by the current DSS */ if (offset < skb->len) { __wsum csum; len = min(skb->len - offset, delta); csum = skb_checksum(skb, offset, len, 0); subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum, subflow->map_csum_len); delta -= len; subflow->map_csum_len += len; } if (delta == 0) break; if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { /* if this subflow is closed, the partial mapping * will be never completed; flush the pending skbs, so * that subflow_sched_work_if_closed() can kick in */ if (unlikely(ssk->sk_state == TCP_CLOSE)) while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); /* not enough data to validate the csum */ return MAPPING_EMPTY; } /* the DSS mapping for next skbs will be validated later, * when a get_mapping_status call will process such skb */ skb = skb->next; } /* note that 'map_data_len' accounts only for the carried data, does * not include the eventual seq increment due to the data fin, * while the pseudo header requires the original DSS data len, * including that */ csum = __mptcp_make_csum(subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len + subflow->map_data_fin, subflow->map_data_csum); if (unlikely(csum)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); return MAPPING_BAD_CSUM; } subflow->valid_csum_seen = 1; return MAPPING_OK; } static enum mapping_status get_mapping_status(struct sock *ssk, struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool csum_reqd = READ_ONCE(msk->csum_enabled); struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; u64 map_seq; skb = skb_peek(&ssk->sk_receive_queue); if (!skb) return MAPPING_EMPTY; if (mptcp_check_fallback(ssk)) return MAPPING_DUMMY; mpext = mptcp_get_ext(skb); if (!mpext || !mpext->use_map) { if (!subflow->map_valid && !skb->len) { /* the TCP stack deliver 0 len FIN pkt to the receive * queue, that is the only 0len pkts ever expected here, * and we can admit no mapping only for 0 len pkts */ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) WARN_ONCE(1, "0len seq %d:%d flags %x", TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, TCP_SKB_CB(skb)->tcp_flags); sk_eat_skb(ssk, skb); return MAPPING_EMPTY; } if (!subflow->map_valid) return MAPPING_INVALID; goto validate_seq; } trace_get_mapping_status(mpext); data_len = mpext->data_len; if (data_len == 0) { pr_debug("infinite mapping received"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); subflow->map_data_len = 0; return MAPPING_INVALID; } if (mpext->data_fin == 1) { if (data_len == 1) { bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq, mpext->dsn64); pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq); if (subflow->map_valid) { /* A DATA_FIN might arrive in a DSS * option before the previous mapping * has been fully consumed. Continue * handling the existing mapping. */ skb_ext_del(skb, SKB_EXT_MPTCP); return MAPPING_OK; } else { if (updated) mptcp_schedule_work((struct sock *)msk); return MAPPING_DATA_FIN; } } else { u64 data_fin_seq = mpext->data_seq + data_len - 1; /* If mpext->data_seq is a 32-bit value, data_fin_seq * must also be limited to 32 bits. */ if (!mpext->dsn64) data_fin_seq &= GENMASK_ULL(31, 0); mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64); pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d", data_fin_seq, mpext->dsn64); } /* Adjust for DATA_FIN using 1 byte of sequence space */ data_len--; } map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64); WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); if (subflow->map_valid) { /* Allow replacing only with an identical map */ if (subflow->map_seq == map_seq && subflow->map_subflow_seq == mpext->subflow_seq && subflow->map_data_len == data_len && subflow->map_csum_reqd == mpext->csum_reqd) { skb_ext_del(skb, SKB_EXT_MPTCP); goto validate_csum; } /* If this skb data are fully covered by the current mapping, * the new map would need caching, which is not supported */ if (skb_is_fully_mapped(ssk, skb)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH); return MAPPING_INVALID; } /* will validate the next map after consuming the current one */ goto validate_csum; } subflow->map_seq = map_seq; subflow->map_subflow_seq = mpext->subflow_seq; subflow->map_data_len = data_len; subflow->map_valid = 1; subflow->map_data_fin = mpext->data_fin; subflow->mpc_map = mpext->mpc_map; subflow->map_csum_reqd = mpext->csum_reqd; subflow->map_csum_len = 0; subflow->map_data_csum = csum_unfold(mpext->csum); /* Cfr RFC 8684 Section 3.3.0 */ if (unlikely(subflow->map_csum_reqd != csum_reqd)) return MAPPING_INVALID; pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len, subflow->map_csum_reqd, subflow->map_data_csum); validate_seq: /* we revalidate valid mapping on new skb, because we must ensure * the current skb is completely covered by the available mapping */ if (!validate_mapping(ssk, skb)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH); return MAPPING_INVALID; } skb_ext_del(skb, SKB_EXT_MPTCP); validate_csum: return validate_data_csum(ssk, skb, csum_reqd); } static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, u64 limit) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; u32 incr; incr = limit >= skb->len ? skb->len + fin : limit; pr_debug("discarding=%d len=%d seq=%d", incr, skb->len, subflow->map_subflow_seq); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); tcp_sk(ssk)->copied_seq += incr; if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq)) sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) subflow->map_valid = 0; } /* sched mptcp worker to remove the subflow if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { if (likely(ssk->sk_state != TCP_CLOSE)) return; if (skb_queue_empty(&ssk->sk_receive_queue) && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) mptcp_schedule_work((struct sock *)msk); } static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) { struct mptcp_sock *msk = mptcp_sk(subflow->conn); if (subflow->mp_join) return false; else if (READ_ONCE(msk->csum_enabled)) return !subflow->valid_csum_seen; else return !subflow->fully_established; } static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned long fail_tout; /* greceful failure can happen only on the MPC subflow */ if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) return; /* since the close timeout take precedence on the fail one, * no need to start the latter when the first is already set */ if (sock_flag((struct sock *)msk, SOCK_DEAD)) return; /* we don't need extreme accuracy here, use a zero fail_tout as special * value meaning no fail timeout at all; */ fail_tout = jiffies + TCP_RTO_MAX; if (!fail_tout) fail_tout = 1; WRITE_ONCE(subflow->fail_tout, fail_tout); tcp_send_ack(ssk); mptcp_reset_tout_timer(msk, subflow->fail_tout); } static bool subflow_check_data_avail(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); enum mapping_status status; struct mptcp_sock *msk; struct sk_buff *skb; if (!skb_peek(&ssk->sk_receive_queue)) WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); if (subflow->data_avail) return true; msk = mptcp_sk(subflow->conn); for (;;) { u64 ack_seq; u64 old_ack; status = get_mapping_status(ssk, msk); trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY || status == MAPPING_BAD_CSUM)) goto fallback; if (status != MAPPING_OK) goto no_data; skb = skb_peek(&ssk->sk_receive_queue); if (WARN_ON_ONCE(!skb)) goto no_data; if (unlikely(!READ_ONCE(msk->can_ack))) goto fallback; old_ack = READ_ONCE(msk->ack_seq); ack_seq = mptcp_subflow_get_mapped_dsn(subflow); pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, ack_seq); if (unlikely(before64(ack_seq, old_ack))) { mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); continue; } WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); break; } return true; no_data: subflow_sched_work_if_closed(msk, ssk); return false; fallback: if (!__mptcp_check_fallback(msk)) { /* RFC 8684 section 3.7. */ if (status == MAPPING_BAD_CSUM && (subflow->mp_join || subflow->valid_csum_seen)) { subflow->send_mp_fail = 1; if (!READ_ONCE(msk->allow_infinite_fallback)) { subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; goto reset; } mptcp_subflow_fail(msk, ssk); WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); return true; } if (!subflow_can_fallback(subflow) && subflow->map_data_len) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMPTCP; reset: WRITE_ONCE(ssk->sk_err, EBADMSG); tcp_set_state(ssk, TCP_CLOSE); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); tcp_send_active_reset(ssk, GFP_ATOMIC); WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); return false; } mptcp_do_fallback(ssk); } skb = skb_peek(&ssk->sk_receive_queue); subflow->map_valid = 1; subflow->map_seq = READ_ONCE(msk->ack_seq); subflow->map_data_len = skb->len; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); return true; } bool mptcp_subflow_data_available(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* check if current mapping is still valid */ if (subflow->map_valid && mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { subflow->map_valid = 0; WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); pr_debug("Done with mapping: seq=%u data_len=%u", subflow->map_subflow_seq, subflow->map_data_len); } return subflow_check_data_avail(sk); } /* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, * not the ssk one. * * In mptcp, rwin is about the mptcp-level connection data. * * Data that is still on the ssk rx queue can thus be ignored, * as far as mptcp peer is concerned that data is still inflight. * DSS ACK is updated when skb is moved to the mptcp rx queue. */ void mptcp_space(const struct sock *ssk, int *space, int *full_space) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); const struct sock *sk = subflow->conn; *space = __mptcp_space(sk); *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } static void subflow_error_report(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; /* bail early if this is a no-op, so that we avoid introducing a * problematic lockdep dependency between TCP accept queue lock * and msk socket spinlock */ if (!sk->sk_socket) return; mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); else __set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); u16 state = 1 << inet_sk_state_load(sk); struct sock *parent = subflow->conn; struct mptcp_sock *msk; trace_sk_data_ready(sk); msk = mptcp_sk(parent); if (state & TCPF_LISTEN) { /* MPJ subflow are removed from accept queue before reaching here, * avoid stray wakeups */ if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) return; parent->sk_data_ready(parent); return; } WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && !subflow->mp_join && !(state & TCPF_CLOSE)); if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); else if (unlikely(sk->sk_err)) subflow_error_report(sk); } static void subflow_write_space(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; mptcp_propagate_sndbuf(sk, ssk); mptcp_write_space(sk); } static const struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (sk->sk_family == AF_INET6) return &subflow_v6_specific; #endif return &subflow_specific; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock_af_ops *target; target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); if (likely(icsk->icsk_af_ops == target)) return; subflow->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = target; } #endif void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family) { memset(addr, 0, sizeof(*addr)); addr->ss_family = family; if (addr->ss_family == AF_INET) { struct sockaddr_in *in_addr = (struct sockaddr_in *)addr; if (info->family == AF_INET) in_addr->sin_addr = info->addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ipv6_addr_v4mapped(&info->addr6)) in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3]; #endif in_addr->sin_port = info->port; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr; if (info->family == AF_INET) ipv6_addr_set_v4mapped(info->addr.s_addr, &in6_addr->sin6_addr); else in6_addr->sin6_addr = info->addr6; in6_addr->sin6_port = info->port; } #endif } int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *remote) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; struct sockaddr_storage addr; int remote_id = remote->id; int local_id = loc->id; int err = -ENOTCONN; struct socket *sf; struct sock *ssk; u32 remote_token; int addrlen; int ifindex; u8 flags; if (!mptcp_is_fully_established(sk)) goto err_out; err = mptcp_subflow_create_socket(sk, loc->family, &sf); if (err) goto err_out; ssk = sf->sk; subflow = mptcp_subflow_ctx(ssk); do { get_random_bytes(&subflow->local_nonce, sizeof(u32)); } while (!subflow->local_nonce); if (local_id) subflow_set_local_id(subflow, local_id); mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id, &flags, &ifindex); subflow->remote_key_valid = 1; subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; subflow->token = msk->token; mptcp_info2sockaddr(loc, &addr, ssk->sk_family); addrlen = sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif mptcp_sockopt_sync(msk, ssk); ssk->sk_bound_dev_if = ifindex; err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); if (err) goto failed; mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk, remote_token, local_id, remote_id); subflow->remote_token = remote_token; subflow->remote_id = remote_id; subflow->request_join = 1; subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); subflow->subflow_id = msk->subflow_id++; mptcp_info2sockaddr(remote, &addr, ssk->sk_family); sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) goto failed_unlink; /* discard the subflow socket */ mptcp_sock_graft(ssk, sk->sk_socket); iput(SOCK_INODE(sf)); WRITE_ONCE(msk->allow_infinite_fallback, false); mptcp_stop_tout_timer(sk); return 0; failed_unlink: list_del(&subflow->node); sock_put(mptcp_subflow_tcp_sock(subflow)); failed: subflow->disposable = 1; sock_release(sf); err_out: /* we account subflows before the creation, and this failures will not * be caught by sk_state_change() */ mptcp_pm_close_subflow(msk); return err; } static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) { #ifdef CONFIG_SOCK_CGROUP_DATA struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, *child_skcd = &child->sk_cgrp_data; /* only the additional subflows created by kworkers have to be modified */ if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != cgroup_id(sock_cgroup_ptr(child_skcd))) { #ifdef CONFIG_MEMCG struct mem_cgroup *memcg = parent->sk_memcg; mem_cgroup_sk_free(child); if (memcg && css_tryget(&memcg->css)) child->sk_memcg = memcg; #endif /* CONFIG_MEMCG */ cgroup_sk_free(child_skcd); *child_skcd = *parent_skcd; cgroup_sk_clone(child_skcd); } #endif /* CONFIG_SOCK_CGROUP_DATA */ } static void mptcp_subflow_ops_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (ssk->sk_prot == &tcpv6_prot) ssk->sk_prot = &tcpv6_prot_override; else #endif ssk->sk_prot = &tcp_prot_override; } static void mptcp_subflow_ops_undo_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (ssk->sk_prot == &tcpv6_prot_override) ssk->sk_prot = &tcpv6_prot; else #endif ssk->sk_prot = &tcp_prot; } int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, struct socket **new_sock) { struct mptcp_subflow_context *subflow; struct net *net = sock_net(sk); struct socket *sf; int err; /* un-accepted server sockets can reach here - on bad configuration * bail early to avoid greater trouble later */ if (unlikely(!sk->sk_socket)) return -EINVAL; err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf); if (err) return err; lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING); err = security_mptcp_add_subflow(sk, sf->sk); if (err) goto release_ssk; /* the newly created socket has to be in the same cgroup as its parent */ mptcp_attach_cgroup(sk, sf->sk); /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. * Update ns_tracker to current stack trace and refcounted tracker. */ __netns_tracker_free(net, &sf->sk->ns_tracker, false); sf->sk->sk_net_refcnt = 1; get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); sock_inuse_add(net, 1); err = tcp_set_ulp(sf->sk, "mptcp"); release_ssk: release_sock(sf->sk); if (err) { sock_release(sf); return err; } /* the newly created socket really belongs to the owning MPTCP master * socket, even if for additional subflows the allocation is performed * by a kernel workqueue. Adjust inode references, so that the * procfs/diag interfaces really show this one belonging to the correct * user. */ SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino; SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid; SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid; subflow = mptcp_subflow_ctx(sf->sk); pr_debug("subflow=%p", subflow); *new_sock = sf; sock_hold(sk); subflow->conn = sk; mptcp_subflow_ops_override(sf->sk); return 0; } static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, gfp_t priority) { struct inet_connection_sock *icsk = inet_csk(sk); struct mptcp_subflow_context *ctx; ctx = kzalloc(sizeof(*ctx), priority); if (!ctx) return NULL; rcu_assign_pointer(icsk->icsk_ulp_data, ctx); INIT_LIST_HEAD(&ctx->node); INIT_LIST_HEAD(&ctx->delegated_node); pr_debug("subflow=%p", ctx); ctx->tcp_sock = sk; return ctx; } static void __subflow_state_change(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } static bool subflow_is_done(const struct sock *sk) { return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; } static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; struct mptcp_sock *msk; __subflow_state_change(sk); msk = mptcp_sk(parent); if (subflow_simultaneous_connect(sk)) { mptcp_propagate_sndbuf(parent, sk); mptcp_do_fallback(sk); mptcp_rcv_space_init(msk, sk); pr_fallback(msk); subflow->conn_finished = 1; mptcp_set_connected(parent); } /* as recvmsg() does not acquire the subflow socket for ssk selection * a fin packet carrying a DSS can be unnoticed if we don't trigger * the data available machinery here. */ if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); else if (unlikely(sk->sk_err)) subflow_error_report(sk); subflow_sched_work_if_closed(mptcp_sk(parent), sk); /* when the fallback subflow closes the rx side, trigger a 'dummy' * ingress data fin, so that the msk state will follow along */ if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk && mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) mptcp_schedule_work(parent); } void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) { struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; struct request_sock *req, *head, *tail; struct mptcp_subflow_context *subflow; struct sock *sk, *ssk; /* Due to lock dependencies no relevant lock can be acquired under rskq_lock. * Splice the req list, so that accept() can not reach the pending ssk after * the listener socket is released below. */ spin_lock_bh(&queue->rskq_lock); head = queue->rskq_accept_head; tail = queue->rskq_accept_tail; queue->rskq_accept_head = NULL; queue->rskq_accept_tail = NULL; spin_unlock_bh(&queue->rskq_lock); if (!head) return; /* can't acquire the msk socket lock under the subflow one, * or will cause ABBA deadlock */ release_sock(listener_ssk); for (req = head; req; req = req->dl_next) { ssk = req->sk; if (!sk_is_mptcp(ssk)) continue; subflow = mptcp_subflow_ctx(ssk); if (!subflow || !subflow->conn) continue; sk = subflow->conn; sock_hold(sk); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); __mptcp_unaccepted_force_close(sk); release_sock(sk); /* lockdep will report a false positive ABBA deadlock * between cancel_work_sync and the listener socket. * The involved locks belong to different sockets WRT * the existing AB chain. * Using a per socket key is problematic as key * deregistration requires process context and must be * performed at socket disposal time, in atomic * context. * Just tell lockdep to consider the listener socket * released here. */ mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); mptcp_cancel_work(sk); mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_); sock_put(sk); } /* we are still under the listener msk socket lock */ lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); /* restore the listener queue, to let the TCP code clean it up */ spin_lock_bh(&queue->rskq_lock); WARN_ON_ONCE(queue->rskq_accept_head); queue->rskq_accept_head = head; queue->rskq_accept_tail = tail; spin_unlock_bh(&queue->rskq_lock); } static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct mptcp_subflow_context *ctx; struct tcp_sock *tp = tcp_sk(sk); int err = 0; /* disallow attaching ULP to a socket unless it has been * created with sock_create_kern() */ if (!sk->sk_kern_sock) { err = -EOPNOTSUPP; goto out; } ctx = subflow_create_ctx(sk, GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto out; } pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); tp->is_mptcp = 1; ctx->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = subflow_default_af_ops(sk); ctx->tcp_state_change = sk->sk_state_change; ctx->tcp_error_report = sk->sk_error_report; WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable); WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space); sk->sk_data_ready = subflow_data_ready; sk->sk_write_space = subflow_write_space; sk->sk_state_change = subflow_state_change; sk->sk_error_report = subflow_error_report; out: return err; } static void subflow_ulp_release(struct sock *ssk) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); bool release = true; struct sock *sk; if (!ctx) return; sk = ctx->conn; if (sk) { /* if the msk has been orphaned, keep the ctx * alive, will be freed by __mptcp_close_ssk(), * when the subflow is still unaccepted */ release = ctx->disposable || list_empty(&ctx->node); /* inet_child_forget() does not call sk_state_change(), * explicitly trigger the socket close machinery */ if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) mptcp_schedule_work(sk); sock_put(sk); } mptcp_subflow_ops_undo_override(ssk); if (release) kfree_rcu(ctx, rcu); } static void subflow_ulp_clone(const struct request_sock *req, struct sock *newsk, const gfp_t priority) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); struct mptcp_subflow_context *new_ctx; if (!tcp_rsk(req)->is_mptcp || (!subflow_req->mp_capable && !subflow_req->mp_join)) { subflow_ulp_fallback(newsk, old_ctx); return; } new_ctx = subflow_create_ctx(newsk, priority); if (!new_ctx) { subflow_ulp_fallback(newsk, old_ctx); return; } new_ctx->conn_finished = 1; new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; new_ctx->tcp_state_change = old_ctx->tcp_state_change; new_ctx->tcp_error_report = old_ctx->tcp_error_report; new_ctx->rel_write_seq = 1; new_ctx->tcp_sock = newsk; if (subflow_req->mp_capable) { /* see comments in subflow_syn_recv_sock(), MPTCP connection * is fully established only after we receive the remote key */ new_ctx->mp_capable = 1; new_ctx->local_key = subflow_req->local_key; new_ctx->token = subflow_req->token; new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->idsn = subflow_req->idsn; /* this is the first subflow, id is always 0 */ new_ctx->local_id_valid = 1; } else if (subflow_req->mp_join) { new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; new_ctx->fully_established = 1; new_ctx->remote_key_valid = 1; new_ctx->backup = subflow_req->backup; new_ctx->remote_id = subflow_req->remote_id; new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; /* the subflow req id is valid, fetched via subflow_check_req() * and subflow_token_join_request() */ subflow_set_local_id(new_ctx, subflow_req->local_id); } } static void tcp_release_cb_override(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); long status; /* process and clear all the pending actions, but leave the subflow into * the napi queue. To respect locking, only the same CPU that originated * the action can touch the list. mptcp_napi_poll will take care of it. */ status = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0); if (status) mptcp_subflow_process_delegated(ssk, status); tcp_release_cb(ssk); } static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { .name = "mptcp", .owner = THIS_MODULE, .init = subflow_ulp_init, .release = subflow_ulp_release, .clone = subflow_ulp_clone, }; static int subflow_ops_init(struct request_sock_ops *subflow_ops) { subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, subflow_ops->obj_size, 0, SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, NULL); if (!subflow_ops->slab) return -ENOMEM; return 0; } void __init mptcp_subflow_init(void) { mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops; mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4"; mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor; if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0) panic("MPTCP: failed to init subflow v4 request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack; subflow_specific = ipv4_specific; subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; subflow_specific.rebuild_header = subflow_rebuild_header; tcp_prot_override = tcp_prot; tcp_prot_override.release_cb = tcp_release_cb_override; #if IS_ENABLED(CONFIG_MPTCP_IPV6) /* In struct mptcp_subflow_request_sock, we assume the TCP request sock * structures for v4 and v6 have the same size. It should not changed in * the future but better to make sure to be warned if it is no longer * the case. */ BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock)); mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops; mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6"; mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor; if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0) panic("MPTCP: failed to init subflow v6 request sock ops\n"); subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack; subflow_v6_specific = ipv6_specific; subflow_v6_specific.conn_request = subflow_v6_conn_request; subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header; subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; subflow_v6m_specific.send_check = ipv4_specific.send_check; subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; subflow_v6m_specific.net_frag_header_len = 0; subflow_v6m_specific.rebuild_header = subflow_rebuild_header; tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override.release_cb = tcp_release_cb_override; #endif mptcp_diag_subflow_init(&subflow_ulp_ops); if (tcp_register_ulp(&subflow_ulp_ops) != 0) panic("MPTCP: failed to register subflows to ULP\n"); } |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 | /* * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef _TLS_OFFLOAD_H #define _TLS_OFFLOAD_H #include <linux/types.h> #include <asm/byteorder.h> #include <linux/crypto.h> #include <linux/socket.h> #include <linux/tcp.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/tcp.h> #include <net/strparser.h> #include <crypto/aead.h> #include <uapi/linux/tls.h> struct tls_rec; /* Maximum data size carried in a TLS record */ #define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) #define TLS_HEADER_SIZE 5 #define TLS_NONCE_OFFSET TLS_HEADER_SIZE #define TLS_CRYPTO_INFO_READY(info) ((info)->cipher_type) #define TLS_AAD_SPACE_SIZE 13 #define MAX_IV_SIZE 16 #define TLS_TAG_SIZE 16 #define TLS_MAX_REC_SEQ_SIZE 8 #define TLS_MAX_AAD_SIZE TLS_AAD_SPACE_SIZE /* For CCM mode, the full 16-bytes of IV is made of '4' fields of given sizes. * * IV[16] = b0[1] || implicit nonce[4] || explicit nonce[8] || length[3] * * The field 'length' is encoded in field 'b0' as '(length width - 1)'. * Hence b0 contains (3 - 1) = 2. */ #define TLS_AES_CCM_IV_B0_BYTE 2 #define TLS_SM4_CCM_IV_B0_BYTE 2 enum { TLS_BASE, TLS_SW, TLS_HW, TLS_HW_RECORD, TLS_NUM_CONFIG, }; struct tx_work { struct delayed_work work; struct sock *sk; }; struct tls_sw_context_tx { struct crypto_aead *aead_send; struct crypto_wait async_wait; struct tx_work tx_work; struct tls_rec *open_rec; struct list_head tx_list; atomic_t encrypt_pending; /* protect crypto_wait with encrypt_pending */ spinlock_t encrypt_compl_lock; int async_notify; u8 async_capable:1; #define BIT_TX_SCHEDULED 0 #define BIT_TX_CLOSING 1 unsigned long tx_bitmask; }; struct tls_strparser { struct sock *sk; u32 mark : 8; u32 stopped : 1; u32 copy_mode : 1; u32 mixed_decrypted : 1; u32 msg_ready : 1; struct strp_msg stm; struct sk_buff *anchor; struct work_struct work; }; struct tls_sw_context_rx { struct crypto_aead *aead_recv; struct crypto_wait async_wait; struct sk_buff_head rx_list; /* list of decrypted 'data' records */ void (*saved_data_ready)(struct sock *sk); u8 reader_present; u8 async_capable:1; u8 zc_capable:1; u8 reader_contended:1; struct tls_strparser strp; atomic_t decrypt_pending; /* protect crypto_wait with decrypt_pending*/ spinlock_t decrypt_compl_lock; struct sk_buff_head async_hold; struct wait_queue_head wq; }; struct tls_record_info { struct list_head list; u32 end_seq; int len; int num_frags; skb_frag_t frags[MAX_SKB_FRAGS]; }; struct tls_offload_context_tx { struct crypto_aead *aead_send; spinlock_t lock; /* protects records list */ struct list_head records_list; struct tls_record_info *open_record; struct tls_record_info *retransmit_hint; u64 hint_record_sn; u64 unacked_record_sn; struct scatterlist sg_tx_data[MAX_SKB_FRAGS]; void (*sk_destruct)(struct sock *sk); struct work_struct destruct_work; struct tls_context *ctx; u8 driver_state[] __aligned(8); /* The TLS layer reserves room for driver specific state * Currently the belief is that there is not enough * driver specific state to justify another layer of indirection */ #define TLS_DRIVER_STATE_SIZE_TX 16 }; #define TLS_OFFLOAD_CONTEXT_SIZE_TX \ (sizeof(struct tls_offload_context_tx) + TLS_DRIVER_STATE_SIZE_TX) enum tls_context_flags { /* tls_device_down was called after the netdev went down, device state * was released, and kTLS works in software, even though rx_conf is * still TLS_HW (needed for transition). */ TLS_RX_DEV_DEGRADED = 0, /* Unlike RX where resync is driven entirely by the core in TX only * the driver knows when things went out of sync, so we need the flag * to be atomic. */ TLS_TX_SYNC_SCHED = 1, /* tls_dev_del was called for the RX side, device state was released, * but tls_ctx->netdev might still be kept, because TX-side driver * resources might not be released yet. Used to prevent the second * tls_dev_del call in tls_device_down if it happens simultaneously. */ TLS_RX_DEV_CLOSED = 2, }; struct cipher_context { char *iv; char *rec_seq; }; union tls_crypto_context { struct tls_crypto_info info; union { struct tls12_crypto_info_aes_gcm_128 aes_gcm_128; struct tls12_crypto_info_aes_gcm_256 aes_gcm_256; struct tls12_crypto_info_chacha20_poly1305 chacha20_poly1305; struct tls12_crypto_info_sm4_gcm sm4_gcm; struct tls12_crypto_info_sm4_ccm sm4_ccm; }; }; struct tls_prot_info { u16 version; u16 cipher_type; u16 prepend_size; u16 tag_size; u16 overhead_size; u16 iv_size; u16 salt_size; u16 rec_seq_size; u16 aad_size; u16 tail_size; }; struct tls_context { /* read-only cache line */ struct tls_prot_info prot_info; u8 tx_conf:3; u8 rx_conf:3; u8 zerocopy_sendfile:1; u8 rx_no_pad:1; int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); void *priv_ctx_tx; void *priv_ctx_rx; struct net_device __rcu *netdev; /* rw cache line */ struct cipher_context tx; struct cipher_context rx; struct scatterlist *partially_sent_record; u16 partially_sent_offset; bool splicing_pages; bool pending_open_record_frags; struct mutex tx_lock; /* protects partially_sent_* fields and * per-type TX fields */ unsigned long flags; /* cache cold stuff */ struct proto *sk_proto; struct sock *sk; void (*sk_destruct)(struct sock *sk); union tls_crypto_context crypto_send; union tls_crypto_context crypto_recv; struct list_head list; refcount_t refcount; struct rcu_head rcu; }; enum tls_offload_ctx_dir { TLS_OFFLOAD_CTX_DIR_RX, TLS_OFFLOAD_CTX_DIR_TX, }; struct tlsdev_ops { int (*tls_dev_add)(struct net_device *netdev, struct sock *sk, enum tls_offload_ctx_dir direction, struct tls_crypto_info *crypto_info, u32 start_offload_tcp_sn); void (*tls_dev_del)(struct net_device *netdev, struct tls_context *ctx, enum tls_offload_ctx_dir direction); int (*tls_dev_resync)(struct net_device *netdev, struct sock *sk, u32 seq, u8 *rcd_sn, enum tls_offload_ctx_dir direction); }; enum tls_offload_sync_type { TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ = 0, TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT = 1, TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC = 2, }; #define TLS_DEVICE_RESYNC_NH_START_IVAL 2 #define TLS_DEVICE_RESYNC_NH_MAX_IVAL 128 #define TLS_DEVICE_RESYNC_ASYNC_LOGMAX 13 struct tls_offload_resync_async { atomic64_t req; u16 loglen; u16 rcd_delta; u32 log[TLS_DEVICE_RESYNC_ASYNC_LOGMAX]; }; struct tls_offload_context_rx { /* sw must be the first member of tls_offload_context_rx */ struct tls_sw_context_rx sw; enum tls_offload_sync_type resync_type; /* this member is set regardless of resync_type, to avoid branches */ u8 resync_nh_reset:1; /* CORE_NEXT_HINT-only member, but use the hole here */ u8 resync_nh_do_now:1; union { /* TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ */ struct { atomic64_t resync_req; }; /* TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT */ struct { u32 decrypted_failed; u32 decrypted_tgt; } resync_nh; /* TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC */ struct { struct tls_offload_resync_async *resync_async; }; }; u8 driver_state[] __aligned(8); /* The TLS layer reserves room for driver specific state * Currently the belief is that there is not enough * driver specific state to justify another layer of indirection */ #define TLS_DRIVER_STATE_SIZE_RX 8 }; #define TLS_OFFLOAD_CONTEXT_SIZE_RX \ (sizeof(struct tls_offload_context_rx) + TLS_DRIVER_STATE_SIZE_RX) struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn); static inline bool tls_record_is_start_marker(struct tls_record_info *rec) { return rec->len == 0; } static inline u32 tls_record_start_seq(struct tls_record_info *rec) { return rec->end_seq - rec->len; } struct sk_buff * tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, struct sk_buff *skb); struct sk_buff * tls_validate_xmit_skb_sw(struct sock *sk, struct net_device *dev, struct sk_buff *skb); static inline bool tls_is_skb_tx_device_offloaded(const struct sk_buff *skb) { #ifdef CONFIG_TLS_DEVICE struct sock *sk = skb->sk; return sk && sk_fullsock(sk) && (smp_load_acquire(&sk->sk_validate_xmit_skb) == &tls_validate_xmit_skb); #else return false; #endif } static inline struct tls_context *tls_get_ctx(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code, * TLS data path doesn't need rcu_dereference(). */ return (__force void *)icsk->icsk_ulp_data; } static inline struct tls_sw_context_rx *tls_sw_ctx_rx( const struct tls_context *tls_ctx) { return (struct tls_sw_context_rx *)tls_ctx->priv_ctx_rx; } static inline struct tls_sw_context_tx *tls_sw_ctx_tx( const struct tls_context *tls_ctx) { return (struct tls_sw_context_tx *)tls_ctx->priv_ctx_tx; } static inline struct tls_offload_context_tx * tls_offload_ctx_tx(const struct tls_context *tls_ctx) { return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } static inline bool tls_sw_has_ctx_tx(const struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); if (!ctx) return false; return !!tls_sw_ctx_tx(ctx); } static inline bool tls_sw_has_ctx_rx(const struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); if (!ctx) return false; return !!tls_sw_ctx_rx(ctx); } static inline struct tls_offload_context_rx * tls_offload_ctx_rx(const struct tls_context *tls_ctx) { return (struct tls_offload_context_rx *)tls_ctx->priv_ctx_rx; } static inline void *__tls_driver_ctx(struct tls_context *tls_ctx, enum tls_offload_ctx_dir direction) { if (direction == TLS_OFFLOAD_CTX_DIR_TX) return tls_offload_ctx_tx(tls_ctx)->driver_state; else return tls_offload_ctx_rx(tls_ctx)->driver_state; } static inline void * tls_driver_ctx(const struct sock *sk, enum tls_offload_ctx_dir direction) { return __tls_driver_ctx(tls_get_ctx(sk), direction); } #define RESYNC_REQ BIT(0) #define RESYNC_REQ_ASYNC BIT(1) /* The TLS context is valid until sk_destruct is called */ static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); atomic64_set(&rx_ctx->resync_req, ((u64)ntohl(seq) << 32) | RESYNC_REQ); } /* Log all TLS record header TCP sequences in [seq, seq+len] */ static inline void tls_offload_rx_resync_async_request_start(struct sock *sk, __be32 seq, u16 len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); atomic64_set(&rx_ctx->resync_async->req, ((u64)ntohl(seq) << 32) | ((u64)len << 16) | RESYNC_REQ | RESYNC_REQ_ASYNC); rx_ctx->resync_async->loglen = 0; rx_ctx->resync_async->rcd_delta = 0; } static inline void tls_offload_rx_resync_async_request_end(struct sock *sk, __be32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); atomic64_set(&rx_ctx->resync_async->req, ((u64)ntohl(seq) << 32) | RESYNC_REQ); } static inline void tls_offload_rx_resync_set_type(struct sock *sk, enum tls_offload_sync_type type) { struct tls_context *tls_ctx = tls_get_ctx(sk); tls_offload_ctx_rx(tls_ctx)->resync_type = type; } /* Driver's seq tracking has to be disabled until resync succeeded */ static inline bool tls_offload_tx_resync_pending(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); bool ret; ret = test_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags); smp_mb__after_atomic(); return ret; } struct sk_buff *tls_encrypt_skb(struct sk_buff *skb); #ifdef CONFIG_TLS_DEVICE void tls_device_sk_destruct(struct sock *sk); void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq); static inline bool tls_is_sk_rx_device_offloaded(struct sock *sk) { if (!sk_fullsock(sk) || smp_load_acquire(&sk->sk_destruct) != tls_device_sk_destruct) return false; return tls_get_ctx(sk)->rx_conf == TLS_HW; } #endif #endif /* _TLS_OFFLOAD_H */ |
20 20 20 20 20 20 20 20 20 20 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 | // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * The HSR spec says never to forward the same frame twice on the same * interface. A frame is identified by its source MAC address and its HSR * sequence number. This code keeps track of senders and their sequence numbers * to allow filtering of duplicate frames, and to detect HSR ring errors. * Same code handles filtering of duplicates for PRP as well. */ #include <linux/if_ether.h> #include <linux/etherdevice.h> #include <linux/slab.h> #include <linux/rculist.h> #include "hsr_main.h" #include "hsr_framereg.h" #include "hsr_netlink.h" /* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b, * false otherwise. */ static bool seq_nr_after(u16 a, u16 b) { /* Remove inconsistency where * seq_nr_after(a, b) == seq_nr_before(a, b) */ if ((int)b - a == 32768) return false; return (((s16)(b - a)) < 0); } #define seq_nr_before(a, b) seq_nr_after((b), (a)) #define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) { struct hsr_self_node *sn; bool ret = false; rcu_read_lock(); sn = rcu_dereference(hsr->self_node); if (!sn) { WARN_ONCE(1, "HSR: No self node\n"); goto out; } if (ether_addr_equal(addr, sn->macaddress_A) || ether_addr_equal(addr, sn->macaddress_B)) ret = true; out: rcu_read_unlock(); return ret; } /* Search for mac entry. Caller must hold rcu read lock. */ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, const unsigned char addr[ETH_ALEN]) { struct hsr_node *node; list_for_each_entry_rcu(node, node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, addr)) return node; } return NULL; } /* Helper for device init; the self_node is used in hsr_rcv() to recognize * frames from self that's been looped over the HSR ring. */ int hsr_create_self_node(struct hsr_priv *hsr, const unsigned char addr_a[ETH_ALEN], const unsigned char addr_b[ETH_ALEN]) { struct hsr_self_node *sn, *old; sn = kmalloc(sizeof(*sn), GFP_KERNEL); if (!sn) return -ENOMEM; ether_addr_copy(sn->macaddress_A, addr_a); ether_addr_copy(sn->macaddress_B, addr_b); spin_lock_bh(&hsr->list_lock); old = rcu_replace_pointer(hsr->self_node, sn, lockdep_is_held(&hsr->list_lock)); spin_unlock_bh(&hsr->list_lock); if (old) kfree_rcu(old, rcu_head); return 0; } void hsr_del_self_node(struct hsr_priv *hsr) { struct hsr_self_node *old; spin_lock_bh(&hsr->list_lock); old = rcu_replace_pointer(hsr->self_node, NULL, lockdep_is_held(&hsr->list_lock)); spin_unlock_bh(&hsr->list_lock); if (old) kfree_rcu(old, rcu_head); } void hsr_del_nodes(struct list_head *node_db) { struct hsr_node *node; struct hsr_node *tmp; list_for_each_entry_safe(node, tmp, node_db, mac_list) kfree(node); } void prp_handle_san_frame(bool san, enum hsr_port_type port, struct hsr_node *node) { /* Mark if the SAN node is over LAN_A or LAN_B */ if (port == HSR_PT_SLAVE_A) { node->san_a = true; return; } if (port == HSR_PT_SLAVE_B) node->san_b = true; } /* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A; * seq_out is used to initialize filtering of outgoing duplicate frames * originating from the newly added node. */ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, struct list_head *node_db, unsigned char addr[], u16 seq_out, bool san, enum hsr_port_type rx_port) { struct hsr_node *new_node, *node; unsigned long now; int i; new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC); if (!new_node) return NULL; ether_addr_copy(new_node->macaddress_A, addr); spin_lock_init(&new_node->seq_out_lock); /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). */ now = jiffies; for (i = 0; i < HSR_PT_PORTS; i++) { new_node->time_in[i] = now; new_node->time_out[i] = now; } for (i = 0; i < HSR_PT_PORTS; i++) new_node->seq_out[i] = seq_out; if (san && hsr->proto_ops->handle_san_frame) hsr->proto_ops->handle_san_frame(san, rx_port, new_node); spin_lock_bh(&hsr->list_lock); list_for_each_entry_rcu(node, node_db, mac_list, lockdep_is_held(&hsr->list_lock)) { if (ether_addr_equal(node->macaddress_A, addr)) goto out; if (ether_addr_equal(node->macaddress_B, addr)) goto out; } list_add_tail_rcu(&new_node->mac_list, node_db); spin_unlock_bh(&hsr->list_lock); return new_node; out: spin_unlock_bh(&hsr->list_lock); kfree(new_node); return node; } void prp_update_san_info(struct hsr_node *node, bool is_sup) { if (!is_sup) return; node->san_a = false; node->san_b = false; } /* Get the hsr_node from which 'skb' was sent. */ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, struct sk_buff *skb, bool is_sup, enum hsr_port_type rx_port) { struct hsr_priv *hsr = port->hsr; struct hsr_node *node; struct ethhdr *ethhdr; struct prp_rct *rct; bool san = false; u16 seq_out; if (!skb_mac_header_was_set(skb)) return NULL; ethhdr = (struct ethhdr *)skb_mac_header(skb); list_for_each_entry_rcu(node, node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); return node; } if (ether_addr_equal(node->macaddress_B, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); return node; } } /* Everyone may create a node entry, connected node to a HSR/PRP * device. */ if (ethhdr->h_proto == htons(ETH_P_PRP) || ethhdr->h_proto == htons(ETH_P_HSR)) { /* Use the existing sequence_nr from the tag as starting point * for filtering duplicate frames. */ seq_out = hsr_get_skb_sequence_nr(skb) - 1; } else { rct = skb_get_PRP_rct(skb); if (rct && prp_check_lsdu_size(skb, rct, is_sup)) { seq_out = prp_get_skb_sequence_nr(rct); } else { if (rx_port != HSR_PT_MASTER) san = true; seq_out = HSR_SEQNR_START; } } return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out, san, rx_port); } /* Use the Supervision frame's info about an eventual macaddress_B for merging * nodes that has previously had their macaddress_B registered as a separate * node. */ void hsr_handle_sup_frame(struct hsr_frame_info *frame) { struct hsr_node *node_curr = frame->node_src; struct hsr_port *port_rcv = frame->port_rcv; struct hsr_priv *hsr = port_rcv->hsr; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tlv *hsr_sup_tlv; struct hsr_node *node_real; struct sk_buff *skb = NULL; struct list_head *node_db; struct ethhdr *ethhdr; int i; unsigned int pull_size = 0; unsigned int total_pull_size = 0; /* Here either frame->skb_hsr or frame->skb_prp should be * valid as supervision frame always will have protocol * header info. */ if (frame->skb_hsr) skb = frame->skb_hsr; else if (frame->skb_prp) skb = frame->skb_prp; else if (frame->skb_std) skb = frame->skb_std; if (!skb) return; /* Leave the ethernet header. */ pull_size = sizeof(struct ethhdr); skb_pull(skb, pull_size); total_pull_size += pull_size; ethhdr = (struct ethhdr *)skb_mac_header(skb); /* And leave the HSR tag. */ if (ethhdr->h_proto == htons(ETH_P_HSR)) { pull_size = sizeof(struct hsr_tag); skb_pull(skb, pull_size); total_pull_size += pull_size; } /* And leave the HSR sup tag. */ pull_size = sizeof(struct hsr_sup_tag); skb_pull(skb, pull_size); total_pull_size += pull_size; /* get HSR sup payload */ hsr_sp = (struct hsr_sup_payload *)skb->data; /* Merge node_curr (registered on macaddress_B) into node_real */ node_db = &port_rcv->hsr->node_db; node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A); if (!node_real) /* No frame received from AddrA of this node yet */ node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A, HSR_SEQNR_START - 1, true, port_rcv->type); if (!node_real) goto done; /* No mem */ if (node_real == node_curr) /* Node has already been merged */ goto done; /* Leave the first HSR sup payload. */ pull_size = sizeof(struct hsr_sup_payload); skb_pull(skb, pull_size); total_pull_size += pull_size; /* Get second supervision tlv */ hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; /* And check if it is a redbox mac TLV */ if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) { /* We could stop here after pushing hsr_sup_payload, * or proceed and allow macaddress_B and for redboxes. */ /* Sanity check length */ if (hsr_sup_tlv->HSR_TLV_length != 6) goto done; /* Leave the second HSR sup tlv. */ pull_size = sizeof(struct hsr_sup_tlv); skb_pull(skb, pull_size); total_pull_size += pull_size; /* Get redbox mac address. */ hsr_sp = (struct hsr_sup_payload *)skb->data; /* Check if redbox mac and node mac are equal. */ if (!ether_addr_equal(node_real->macaddress_A, hsr_sp->macaddress_A)) { /* This is a redbox supervision frame for a VDAN! */ goto done; } } ether_addr_copy(node_real->macaddress_B, ethhdr->h_source); spin_lock_bh(&node_real->seq_out_lock); for (i = 0; i < HSR_PT_PORTS; i++) { if (!node_curr->time_in_stale[i] && time_after(node_curr->time_in[i], node_real->time_in[i])) { node_real->time_in[i] = node_curr->time_in[i]; node_real->time_in_stale[i] = node_curr->time_in_stale[i]; } if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i])) node_real->seq_out[i] = node_curr->seq_out[i]; } spin_unlock_bh(&node_real->seq_out_lock); node_real->addr_B_port = port_rcv->type; spin_lock_bh(&hsr->list_lock); if (!node_curr->removed) { list_del_rcu(&node_curr->mac_list); node_curr->removed = true; kfree_rcu(node_curr, rcu_head); } spin_unlock_bh(&hsr->list_lock); done: /* Push back here */ skb_push(skb, total_pull_size); } /* 'skb' is a frame meant for this host, that is to be passed to upper layers. * * If the frame was sent by a node's B interface, replace the source * address with that node's "official" address (macaddress_A) so that upper * layers recognize where it came from. */ void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb) { if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: Mac header not set\n", __func__); return; } memcpy(ð_hdr(skb)->h_source, node->macaddress_A, ETH_ALEN); } /* 'skb' is a frame meant for another host. * 'port' is the outgoing interface * * Substitute the target (dest) MAC address if necessary, so the it matches the * recipient interface MAC address, regardless of whether that is the * recipient's A or B interface. * This is needed to keep the packets flowing through switches that learn on * which "side" the different interfaces are. */ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, struct hsr_port *port) { struct hsr_node *node_dst; if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: Mac header not set\n", __func__); return; } if (!is_unicast_ether_addr(eth_hdr(skb)->h_dest)) return; node_dst = find_node_by_addr_A(&port->hsr->node_db, eth_hdr(skb)->h_dest); if (!node_dst) { if (port->hsr->prot_version != PRP_V1 && net_ratelimit()) netdev_err(skb->dev, "%s: Unknown node\n", __func__); return; } if (port->type != node_dst->addr_B_port) return; if (is_valid_ether_addr(node_dst->macaddress_B)) ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->macaddress_B); } void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, u16 sequence_nr) { /* Don't register incoming frames without a valid sequence number. This * ensures entries of restarted nodes gets pruned so that they can * re-register and resume communications. */ if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && seq_nr_before(sequence_nr, node->seq_out[port->type])) return; node->time_in[port->type] = jiffies; node->time_in_stale[port->type] = false; } /* 'skb' is a HSR Ethernet frame (with a HSR tag inserted), with a valid * ethhdr->h_source address and skb->mac_header set. * * Return: * 1 if frame can be shown to have been sent recently on this interface, * 0 otherwise, or * negative error code on error */ int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, u16 sequence_nr) { spin_lock_bh(&node->seq_out_lock); if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) && time_is_after_jiffies(node->time_out[port->type] + msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) { spin_unlock_bh(&node->seq_out_lock); return 1; } node->time_out[port->type] = jiffies; node->seq_out[port->type] = sequence_nr; spin_unlock_bh(&node->seq_out_lock); return 0; } static struct hsr_port *get_late_port(struct hsr_priv *hsr, struct hsr_node *node) { if (node->time_in_stale[HSR_PT_SLAVE_A]) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (node->time_in_stale[HSR_PT_SLAVE_B]) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); if (time_after(node->time_in[HSR_PT_SLAVE_B], node->time_in[HSR_PT_SLAVE_A] + msecs_to_jiffies(MAX_SLAVE_DIFF))) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (time_after(node->time_in[HSR_PT_SLAVE_A], node->time_in[HSR_PT_SLAVE_B] + msecs_to_jiffies(MAX_SLAVE_DIFF))) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); return NULL; } /* Remove stale sequence_nr records. Called by timer every * HSR_LIFE_CHECK_INTERVAL (two seconds or so). */ void hsr_prune_nodes(struct timer_list *t) { struct hsr_priv *hsr = from_timer(hsr, t, prune_timer); struct hsr_node *node; struct hsr_node *tmp; struct hsr_port *port; unsigned long timestamp; unsigned long time_a, time_b; spin_lock_bh(&hsr->list_lock); list_for_each_entry_safe(node, tmp, &hsr->node_db, mac_list) { /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A] * nor time_in[HSR_PT_SLAVE_B], will ever be updated for * the master port. Thus the master node will be repeatedly * pruned leading to packet loss. */ if (hsr_addr_is_self(hsr, node->macaddress_A)) continue; /* Shorthand */ time_a = node->time_in[HSR_PT_SLAVE_A]; time_b = node->time_in[HSR_PT_SLAVE_B]; /* Check for timestamps old enough to risk wrap-around */ if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET / 2)) node->time_in_stale[HSR_PT_SLAVE_A] = true; if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET / 2)) node->time_in_stale[HSR_PT_SLAVE_B] = true; /* Get age of newest frame from node. * At least one time_in is OK here; nodes get pruned long * before both time_ins can get stale */ timestamp = time_a; if (node->time_in_stale[HSR_PT_SLAVE_A] || (!node->time_in_stale[HSR_PT_SLAVE_B] && time_after(time_b, time_a))) timestamp = time_b; /* Warn of ring error only as long as we get frames at all */ if (time_is_after_jiffies(timestamp + msecs_to_jiffies(1.5 * MAX_SLAVE_DIFF))) { rcu_read_lock(); port = get_late_port(hsr, node); if (port) hsr_nl_ringerror(hsr, node->macaddress_A, port); rcu_read_unlock(); } /* Prune old entries */ if (time_is_before_jiffies(timestamp + msecs_to_jiffies(HSR_NODE_FORGET_TIME))) { hsr_nl_nodedown(hsr, node->macaddress_A); if (!node->removed) { list_del_rcu(&node->mac_list); node->removed = true; /* Note that we need to free this entry later: */ kfree_rcu(node, rcu_head); } } } spin_unlock_bh(&hsr->list_lock); /* Restart timer */ mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); } void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, unsigned char addr[ETH_ALEN]) { struct hsr_node *node; if (!_pos) { node = list_first_or_null_rcu(&hsr->node_db, struct hsr_node, mac_list); if (node) ether_addr_copy(addr, node->macaddress_A); return node; } node = _pos; list_for_each_entry_continue_rcu(node, &hsr->node_db, mac_list) { ether_addr_copy(addr, node->macaddress_A); return node; } return NULL; } int hsr_get_node_data(struct hsr_priv *hsr, const unsigned char *addr, unsigned char addr_b[ETH_ALEN], unsigned int *addr_b_ifindex, int *if1_age, u16 *if1_seq, int *if2_age, u16 *if2_seq) { struct hsr_node *node; struct hsr_port *port; unsigned long tdiff; node = find_node_by_addr_A(&hsr->node_db, addr); if (!node) return -ENOENT; ether_addr_copy(addr_b, node->macaddress_B); tdiff = jiffies - node->time_in[HSR_PT_SLAVE_A]; if (node->time_in_stale[HSR_PT_SLAVE_A]) *if1_age = INT_MAX; #if HZ <= MSEC_PER_SEC else if (tdiff > msecs_to_jiffies(INT_MAX)) *if1_age = INT_MAX; #endif else *if1_age = jiffies_to_msecs(tdiff); tdiff = jiffies - node->time_in[HSR_PT_SLAVE_B]; if (node->time_in_stale[HSR_PT_SLAVE_B]) *if2_age = INT_MAX; #if HZ <= MSEC_PER_SEC else if (tdiff > msecs_to_jiffies(INT_MAX)) *if2_age = INT_MAX; #endif else *if2_age = jiffies_to_msecs(tdiff); /* Present sequence numbers as if they were incoming on interface */ *if1_seq = node->seq_out[HSR_PT_SLAVE_B]; *if2_seq = node->seq_out[HSR_PT_SLAVE_A]; if (node->addr_B_port != HSR_PT_NONE) { port = hsr_port_get_hsr(hsr, node->addr_B_port); *addr_b_ifindex = port->dev->ifindex; } else { *addr_b_ifindex = -1; } return 0; } |
3157 475 2082 1612 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | #ifndef _LINUX_HASH_H #define _LINUX_HASH_H /* Fast hashing routine for ints, longs and pointers. (C) 2002 Nadia Yvette Chambers, IBM */ #include <asm/types.h> #include <linux/compiler.h> /* * The "GOLDEN_RATIO_PRIME" is used in ifs/btrfs/brtfs_inode.h and * fs/inode.c. It's not actually prime any more (the previous primes * were actively bad for hashing), but the name remains. */ #if BITS_PER_LONG == 32 #define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32 #define hash_long(val, bits) hash_32(val, bits) #elif BITS_PER_LONG == 64 #define hash_long(val, bits) hash_64(val, bits) #define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64 #else #error Wordsize not 32 or 64 #endif /* * This hash multiplies the input by a large odd number and takes the * high bits. Since multiplication propagates changes to the most * significant end only, it is essential that the high bits of the * product be used for the hash value. * * Chuck Lever verified the effectiveness of this technique: * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf * * Although a random odd number will do, it turns out that the golden * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice * properties. (See Knuth vol 3, section 6.4, exercise 9.) * * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2, * which is very slightly easier to multiply by and makes no * difference to the hash distribution. */ #define GOLDEN_RATIO_32 0x61C88647 #define GOLDEN_RATIO_64 0x61C8864680B583EBull #ifdef CONFIG_HAVE_ARCH_HASH /* This header may use the GOLDEN_RATIO_xx constants */ #include <asm/hash.h> #endif /* * The _generic versions exist only so lib/test_hash.c can compare * the arch-optimized versions with the generic. * * Note that if you change these, any <asm/hash.h> that aren't updated * to match need to have their HAVE_ARCH_* define values updated so the * self-test will not false-positive. */ #ifndef HAVE_ARCH__HASH_32 #define __hash_32 __hash_32_generic #endif static inline u32 __hash_32_generic(u32 val) { return val * GOLDEN_RATIO_32; } static inline u32 hash_32(u32 val, unsigned int bits) { /* High bits are more random, so use them. */ return __hash_32(val) >> (32 - bits); } #ifndef HAVE_ARCH_HASH_64 #define hash_64 hash_64_generic #endif static __always_inline u32 hash_64_generic(u64 val, unsigned int bits) { #if BITS_PER_LONG == 64 /* 64x64-bit multiply is efficient on all 64-bit processors */ return val * GOLDEN_RATIO_64 >> (64 - bits); #else /* Hash 64 bits using only 32x32-bit multiply. */ return hash_32((u32)val ^ __hash_32(val >> 32), bits); #endif } static inline u32 hash_ptr(const void *ptr, unsigned int bits) { return hash_long((unsigned long)ptr, bits); } /* This really should be called fold32_ptr; it does no hashing to speak of. */ static inline u32 hash32_ptr(const void *ptr) { unsigned long val = (unsigned long)ptr; #if BITS_PER_LONG == 64 val ^= (val >> 32); #endif return (u32)val; } #endif /* _LINUX_HASH_H */ |
4 4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Sysfs attributes of bridge ports * Linux ethernet bridge * * Authors: * Stephen Hemminger <shemminger@osdl.org> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_bridge.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> #include <linux/sched/signal.h> #include "br_private.h" /* IMPORTANT: new bridge port options must be added with netlink support only * please do not add new sysfs entries */ struct brport_attribute { struct attribute attr; ssize_t (*show)(struct net_bridge_port *, char *); int (*store)(struct net_bridge_port *, unsigned long); int (*store_raw)(struct net_bridge_port *, char *); }; #define BRPORT_ATTR_RAW(_name, _mode, _show, _store) \ const struct brport_attribute brport_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = _mode }, \ .show = _show, \ .store_raw = _store, \ }; #define BRPORT_ATTR(_name, _mode, _show, _store) \ const struct brport_attribute brport_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = _mode }, \ .show = _show, \ .store = _store, \ }; #define BRPORT_ATTR_FLAG(_name, _mask) \ static ssize_t show_##_name(struct net_bridge_port *p, char *buf) \ { \ return sprintf(buf, "%d\n", !!(p->flags & _mask)); \ } \ static int store_##_name(struct net_bridge_port *p, unsigned long v) \ { \ return store_flag(p, v, _mask); \ } \ static BRPORT_ATTR(_name, 0644, \ show_##_name, store_##_name) static int store_flag(struct net_bridge_port *p, unsigned long v, unsigned long mask) { struct netlink_ext_ack extack = {0}; unsigned long flags = p->flags; int err; if (v) flags |= mask; else flags &= ~mask; if (flags != p->flags) { err = br_switchdev_set_port_flag(p, flags, mask, &extack); if (err) { netdev_err(p->dev, "%s\n", extack._msg); return err; } p->flags = flags; br_port_flags_change(p, mask); } return 0; } static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->path_cost); } static BRPORT_ATTR(path_cost, 0644, show_path_cost, br_stp_set_path_cost); static ssize_t show_priority(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->priority); } static BRPORT_ATTR(priority, 0644, show_priority, br_stp_set_port_priority); static ssize_t show_designated_root(struct net_bridge_port *p, char *buf) { return br_show_bridge_id(buf, &p->designated_root); } static BRPORT_ATTR(designated_root, 0444, show_designated_root, NULL); static ssize_t show_designated_bridge(struct net_bridge_port *p, char *buf) { return br_show_bridge_id(buf, &p->designated_bridge); } static BRPORT_ATTR(designated_bridge, 0444, show_designated_bridge, NULL); static ssize_t show_designated_port(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->designated_port); } static BRPORT_ATTR(designated_port, 0444, show_designated_port, NULL); static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->designated_cost); } static BRPORT_ATTR(designated_cost, 0444, show_designated_cost, NULL); static ssize_t show_port_id(struct net_bridge_port *p, char *buf) { return sprintf(buf, "0x%x\n", p->port_id); } static BRPORT_ATTR(port_id, 0444, show_port_id, NULL); static ssize_t show_port_no(struct net_bridge_port *p, char *buf) { return sprintf(buf, "0x%x\n", p->port_no); } static BRPORT_ATTR(port_no, 0444, show_port_no, NULL); static ssize_t show_change_ack(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->topology_change_ack); } static BRPORT_ATTR(change_ack, 0444, show_change_ack, NULL); static ssize_t show_config_pending(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->config_pending); } static BRPORT_ATTR(config_pending, 0444, show_config_pending, NULL); static ssize_t show_port_state(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->state); } static BRPORT_ATTR(state, 0444, show_port_state, NULL); static ssize_t show_message_age_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer)); } static BRPORT_ATTR(message_age_timer, 0444, show_message_age_timer, NULL); static ssize_t show_forward_delay_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer)); } static BRPORT_ATTR(forward_delay_timer, 0444, show_forward_delay_timer, NULL); static ssize_t show_hold_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer)); } static BRPORT_ATTR(hold_timer, 0444, show_hold_timer, NULL); static int store_flush(struct net_bridge_port *p, unsigned long v) { br_fdb_delete_by_port(p->br, p, 0, 0); // Don't delete local entry return 0; } static BRPORT_ATTR(flush, 0200, NULL, store_flush); static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%#x\n", p->group_fwd_mask); } static int store_group_fwd_mask(struct net_bridge_port *p, unsigned long v) { if (v & BR_GROUPFWD_MACPAUSE) return -EINVAL; p->group_fwd_mask = v; return 0; } static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask, store_group_fwd_mask); static ssize_t show_backup_port(struct net_bridge_port *p, char *buf) { struct net_bridge_port *backup_p; int ret = 0; rcu_read_lock(); backup_p = rcu_dereference(p->backup_port); if (backup_p) ret = sprintf(buf, "%s\n", backup_p->dev->name); rcu_read_unlock(); return ret; } static int store_backup_port(struct net_bridge_port *p, char *buf) { struct net_device *backup_dev = NULL; char *nl = strchr(buf, '\n'); if (nl) *nl = '\0'; if (strlen(buf) > 0) { backup_dev = __dev_get_by_name(dev_net(p->dev), buf); if (!backup_dev) return -ENOENT; } return nbp_backup_change(p, backup_dev); } static BRPORT_ATTR_RAW(backup_port, 0644, show_backup_port, store_backup_port); BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); BRPORT_ATTR_FLAG(learning, BR_LEARNING); BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS); BRPORT_ATTR_FLAG(isolated, BR_ISOLATED); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->multicast_ctx.multicast_router); } static int store_multicast_router(struct net_bridge_port *p, unsigned long v) { return br_multicast_set_port_router(&p->multicast_ctx, v); } static BRPORT_ATTR(multicast_router, 0644, show_multicast_router, store_multicast_router); BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE); BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UNICAST); #endif static const struct brport_attribute *brport_attrs[] = { &brport_attr_path_cost, &brport_attr_priority, &brport_attr_port_id, &brport_attr_port_no, &brport_attr_designated_root, &brport_attr_designated_bridge, &brport_attr_designated_port, &brport_attr_designated_cost, &brport_attr_state, &brport_attr_change_ack, &brport_attr_config_pending, &brport_attr_message_age_timer, &brport_attr_forward_delay_timer, &brport_attr_hold_timer, &brport_attr_flush, &brport_attr_hairpin_mode, &brport_attr_bpdu_guard, &brport_attr_root_block, &brport_attr_learning, &brport_attr_unicast_flood, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &brport_attr_multicast_router, &brport_attr_multicast_fast_leave, &brport_attr_multicast_to_unicast, #endif &brport_attr_proxyarp, &brport_attr_proxyarp_wifi, &brport_attr_multicast_flood, &brport_attr_broadcast_flood, &brport_attr_group_fwd_mask, &brport_attr_neigh_suppress, &brport_attr_isolated, &brport_attr_backup_port, NULL }; #define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr) static ssize_t brport_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct brport_attribute *brport_attr = to_brport_attr(attr); struct net_bridge_port *p = kobj_to_brport(kobj); if (!brport_attr->show) return -EINVAL; return brport_attr->show(p, buf); } static ssize_t brport_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct brport_attribute *brport_attr = to_brport_attr(attr); struct net_bridge_port *p = kobj_to_brport(kobj); ssize_t ret = -EINVAL; unsigned long val; char *endp; if (!ns_capable(dev_net(p->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!rtnl_trylock()) return restart_syscall(); if (brport_attr->store_raw) { char *buf_copy; buf_copy = kstrndup(buf, count, GFP_KERNEL); if (!buf_copy) { ret = -ENOMEM; goto out_unlock; } spin_lock_bh(&p->br->lock); ret = brport_attr->store_raw(p, buf_copy); spin_unlock_bh(&p->br->lock); kfree(buf_copy); } else if (brport_attr->store) { val = simple_strtoul(buf, &endp, 0); if (endp == buf) goto out_unlock; spin_lock_bh(&p->br->lock); ret = brport_attr->store(p, val); spin_unlock_bh(&p->br->lock); } if (!ret) { br_ifinfo_notify(RTM_NEWLINK, NULL, p); ret = count; } out_unlock: rtnl_unlock(); return ret; } const struct sysfs_ops brport_sysfs_ops = { .show = brport_show, .store = brport_store, }; /* * Add sysfs entries to ethernet device added to a bridge. * Creates a brport subdirectory with bridge attributes. * Puts symlink in bridge's brif subdirectory */ int br_sysfs_addif(struct net_bridge_port *p) { struct net_bridge *br = p->br; const struct brport_attribute **a; int err; err = sysfs_create_link(&p->kobj, &br->dev->dev.kobj, SYSFS_BRIDGE_PORT_LINK); if (err) return err; for (a = brport_attrs; *a; ++a) { err = sysfs_create_file(&p->kobj, &((*a)->attr)); if (err) return err; } strscpy(p->sysfs_name, p->dev->name, IFNAMSIZ); return sysfs_create_link(br->ifobj, &p->kobj, p->sysfs_name); } /* Rename bridge's brif symlink */ int br_sysfs_renameif(struct net_bridge_port *p) { struct net_bridge *br = p->br; int err; /* If a rename fails, the rollback will cause another * rename call with the existing name. */ if (!strncmp(p->sysfs_name, p->dev->name, IFNAMSIZ)) return 0; err = sysfs_rename_link(br->ifobj, &p->kobj, p->sysfs_name, p->dev->name); if (err) netdev_notice(br->dev, "unable to rename link %s to %s", p->sysfs_name, p->dev->name); else strscpy(p->sysfs_name, p->dev->name, IFNAMSIZ); return err; } |
20327 3715 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Prevent the compiler from merging or refetching reads or writes. The * compiler is also forbidden from reordering successive instances of * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some * particular ordering. One way to make the compiler aware of ordering is to * put the two invocations of READ_ONCE or WRITE_ONCE in different C * statements. * * These two macros will also work on aggregate data types like structs or * unions. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, * and (2) Ensuring that the compiler does not fold, spindle, or otherwise * mutilate accesses that either do not require ordering or that interact * with an explicit memory barrier or atomic instruction that provides the * required ordering. */ #ifndef __ASM_GENERIC_RWONCE_H #define __ASM_GENERIC_RWONCE_H #ifndef __ASSEMBLY__ #include <linux/compiler_types.h> #include <linux/kasan-checks.h> #include <linux/kcsan-checks.h> /* * Yes, this permits 64-bit accesses on 32-bit architectures. These will * actually be atomic in some cases (namely Armv7 + LPAE), but for others we * rely on the access being split into 2x32-bit accesses for a 32-bit quantity * (e.g. a virtual address) and a strong prevailing wind. */ #define compiletime_assert_rwonce_type(t) \ compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ "Unsupported access size for {READ,WRITE}_ONCE().") /* * Use __READ_ONCE() instead of READ_ONCE() if you do not require any * atomicity. Note that this may result in tears! */ #ifndef __READ_ONCE #define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) #endif #define READ_ONCE(x) \ ({ \ compiletime_assert_rwonce_type(x); \ __READ_ONCE(x); \ }) #define __WRITE_ONCE(x, val) \ do { \ *(volatile typeof(x) *)&(x) = (val); \ } while (0) #define WRITE_ONCE(x, val) \ do { \ compiletime_assert_rwonce_type(x); \ __WRITE_ONCE(x, val); \ } while (0) static __no_sanitize_or_inline unsigned long __read_once_word_nocheck(const void *addr) { return __READ_ONCE(*(unsigned long *)addr); } /* * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a * word from memory atomically but without telling KASAN/KCSAN. This is * usually used by unwinding code when walking the stack of a running process. */ #define READ_ONCE_NOCHECK(x) \ ({ \ compiletime_assert(sizeof(x) == sizeof(unsigned long), \ "Unsupported access size for READ_ONCE_NOCHECK()."); \ (typeof(x))__read_once_word_nocheck(&(x)); \ }) static __no_kasan_or_inline unsigned long read_word_at_a_time(const void *addr) { kasan_check_read(addr, 1); return *(unsigned long *)addr; } #endif /* __ASSEMBLY__ */ #endif /* __ASM_GENERIC_RWONCE_H */ |
2958 2958 2958 3267 3267 3267 1762 3261 2958 2958 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/init.h> #include <linux/scatterlist.h> #include <linux/mempool.h> #include <linux/slab.h> #define SG_MEMPOOL_NR ARRAY_SIZE(sg_pools) #define SG_MEMPOOL_SIZE 2 struct sg_pool { size_t size; char *name; struct kmem_cache *slab; mempool_t *pool; }; #define SP(x) { .size = x, "sgpool-" __stringify(x) } #if (SG_CHUNK_SIZE < 32) #error SG_CHUNK_SIZE is too small (must be 32 or greater) #endif static struct sg_pool sg_pools[] = { SP(8), SP(16), #if (SG_CHUNK_SIZE > 32) SP(32), #if (SG_CHUNK_SIZE > 64) SP(64), #if (SG_CHUNK_SIZE > 128) SP(128), #if (SG_CHUNK_SIZE > 256) #error SG_CHUNK_SIZE is too large (256 MAX) #endif #endif #endif #endif SP(SG_CHUNK_SIZE) }; #undef SP static inline unsigned int sg_pool_index(unsigned short nents) { unsigned int index; BUG_ON(nents > SG_CHUNK_SIZE); if (nents <= 8) index = 0; else index = get_count_order(nents) - 3; return index; } static void sg_pool_free(struct scatterlist *sgl, unsigned int nents) { struct sg_pool *sgp; sgp = sg_pools + sg_pool_index(nents); mempool_free(sgl, sgp->pool); } static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask) { struct sg_pool *sgp; sgp = sg_pools + sg_pool_index(nents); return mempool_alloc(sgp->pool, gfp_mask); } /** * sg_free_table_chained - Free a previously mapped sg table * @table: The sg table header to use * @nents_first_chunk: size of the first_chunk SGL passed to * sg_alloc_table_chained * * Description: * Free an sg table previously allocated and setup with * sg_alloc_table_chained(). * * @nents_first_chunk has to be same with that same parameter passed * to sg_alloc_table_chained(). * **/ void sg_free_table_chained(struct sg_table *table, unsigned nents_first_chunk) { if (table->orig_nents <= nents_first_chunk) return; if (nents_first_chunk == 1) nents_first_chunk = 0; __sg_free_table(table, SG_CHUNK_SIZE, nents_first_chunk, sg_pool_free, table->orig_nents); } EXPORT_SYMBOL_GPL(sg_free_table_chained); /** * sg_alloc_table_chained - Allocate and chain SGLs in an sg table * @table: The sg table header to use * @nents: Number of entries in sg list * @first_chunk: first SGL * @nents_first_chunk: number of the SGL of @first_chunk * * Description: * Allocate and chain SGLs in an sg table. If @nents@ is larger than * @nents_first_chunk a chained sg table will be setup. @first_chunk is * ignored if nents_first_chunk <= 1 because user expects the SGL points * non-chain SGL. * **/ int sg_alloc_table_chained(struct sg_table *table, int nents, struct scatterlist *first_chunk, unsigned nents_first_chunk) { int ret; BUG_ON(!nents); if (first_chunk && nents_first_chunk) { if (nents <= nents_first_chunk) { table->nents = table->orig_nents = nents; sg_init_table(table->sgl, nents); return 0; } } /* User supposes that the 1st SGL includes real entry */ if (nents_first_chunk <= 1) { first_chunk = NULL; nents_first_chunk = 0; } ret = __sg_alloc_table(table, nents, SG_CHUNK_SIZE, first_chunk, nents_first_chunk, GFP_ATOMIC, sg_pool_alloc); if (unlikely(ret)) sg_free_table_chained(table, nents_first_chunk); return ret; } EXPORT_SYMBOL_GPL(sg_alloc_table_chained); static __init int sg_pool_init(void) { int i; for (i = 0; i < SG_MEMPOOL_NR; i++) { struct sg_pool *sgp = sg_pools + i; int size = sgp->size * sizeof(struct scatterlist); sgp->slab = kmem_cache_create(sgp->name, size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!sgp->slab) { printk(KERN_ERR "SG_POOL: can't init sg slab %s\n", sgp->name); goto cleanup_sdb; } sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, sgp->slab); if (!sgp->pool) { printk(KERN_ERR "SG_POOL: can't init sg mempool %s\n", sgp->name); goto cleanup_sdb; } } return 0; cleanup_sdb: for (i = 0; i < SG_MEMPOOL_NR; i++) { struct sg_pool *sgp = sg_pools + i; mempool_destroy(sgp->pool); kmem_cache_destroy(sgp->slab); } return -ENOMEM; } subsys_initcall(sg_pool_init); |
143 2 137 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HEX_H #define _LINUX_HEX_H #include <linux/types.h> extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] #define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4] static inline char *hex_byte_pack(char *buf, u8 byte) { *buf++ = hex_asc_hi(byte); *buf++ = hex_asc_lo(byte); return buf; } extern const char hex_asc_upper[]; #define hex_asc_upper_lo(x) hex_asc_upper[((x) & 0x0f)] #define hex_asc_upper_hi(x) hex_asc_upper[((x) & 0xf0) >> 4] static inline char *hex_byte_pack_upper(char *buf, u8 byte) { *buf++ = hex_asc_upper_hi(byte); *buf++ = hex_asc_upper_lo(byte); return buf; } extern int hex_to_bin(unsigned char ch); extern int __must_check hex2bin(u8 *dst, const char *src, size_t count); extern char *bin2hex(char *dst, const void *src, size_t count); bool mac_pton(const char *s, u8 *mac); #endif |
341 342 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2013 Intel Corporation * * Author: * Dmitry Kasatkin <dmitry.kasatkin@intel.com> */ #include <linux/err.h> #include <linux/ratelimit.h> #include <linux/key-type.h> #include <crypto/public_key.h> #include <crypto/hash_info.h> #include <keys/asymmetric-type.h> #include <keys/system_keyring.h> #include "integrity.h" /* * Request an asymmetric key. */ static struct key *request_asymmetric_key(struct key *keyring, uint32_t keyid) { struct key *key; char name[12]; sprintf(name, "id:%08x", keyid); pr_debug("key search: \"%s\"\n", name); key = get_ima_blacklist_keyring(); if (key) { key_ref_t kref; kref = keyring_search(make_key_ref(key, 1), &key_type_asymmetric, name, true); if (!IS_ERR(kref)) { pr_err("Key '%s' is in ima_blacklist_keyring\n", name); return ERR_PTR(-EKEYREJECTED); } } if (keyring) { /* search in specific keyring */ key_ref_t kref; kref = keyring_search(make_key_ref(keyring, 1), &key_type_asymmetric, name, true); if (IS_ERR(kref)) key = ERR_CAST(kref); else key = key_ref_to_ptr(kref); } else { key = request_key(&key_type_asymmetric, name, NULL); } if (IS_ERR(key)) { if (keyring) pr_err_ratelimited("Request for unknown key '%s' in '%s' keyring. err %ld\n", name, keyring->description, PTR_ERR(key)); else pr_err_ratelimited("Request for unknown key '%s' err %ld\n", name, PTR_ERR(key)); switch (PTR_ERR(key)) { /* Hide some search errors */ case -EACCES: case -ENOTDIR: case -EAGAIN: return ERR_PTR(-ENOKEY); default: return key; } } pr_debug("%s() = 0 [%x]\n", __func__, key_serial(key)); return key; } int asymmetric_verify(struct key *keyring, const char *sig, int siglen, const char *data, int datalen) { struct public_key_signature pks; struct signature_v2_hdr *hdr = (struct signature_v2_hdr *)sig; const struct public_key *pk; struct key *key; int ret; if (siglen <= sizeof(*hdr)) return -EBADMSG; siglen -= sizeof(*hdr); if (siglen != be16_to_cpu(hdr->sig_size)) return -EBADMSG; if (hdr->hash_algo >= HASH_ALGO__LAST) return -ENOPKG; key = request_asymmetric_key(keyring, be32_to_cpu(hdr->keyid)); if (IS_ERR(key)) return PTR_ERR(key); memset(&pks, 0, sizeof(pks)); pks.hash_algo = hash_algo_name[hdr->hash_algo]; pk = asymmetric_key_public_key(key); pks.pkey_algo = pk->pkey_algo; if (!strcmp(pk->pkey_algo, "rsa")) { pks.encoding = "pkcs1"; } else if (!strncmp(pk->pkey_algo, "ecdsa-", 6)) { /* edcsa-nist-p192 etc. */ pks.encoding = "x962"; } else if (!strcmp(pk->pkey_algo, "ecrdsa") || !strcmp(pk->pkey_algo, "sm2")) { pks.encoding = "raw"; } else { ret = -ENOPKG; goto out; } pks.digest = (u8 *)data; pks.digest_size = datalen; pks.s = hdr->sig; pks.s_size = siglen; ret = verify_signature(key, &pks); out: key_put(key); pr_debug("%s() = %d\n", __func__, ret); return ret; } /** * integrity_kernel_module_request - prevent crypto-pkcs1pad(rsa,*) requests * @kmod_name: kernel module name * * We have situation, when public_key_verify_signature() in case of RSA * algorithm use alg_name to store internal information in order to * construct an algorithm on the fly, but crypto_larval_lookup() will try * to use alg_name in order to load kernel module with same name. * Since we don't have any real "crypto-pkcs1pad(rsa,*)" kernel modules, * we are safe to fail such module request from crypto_larval_lookup(). * * In this way we prevent modprobe execution during digsig verification * and avoid possible deadlock if modprobe and/or it's dependencies * also signed with digsig. */ int integrity_kernel_module_request(char *kmod_name) { if (strncmp(kmod_name, "crypto-pkcs1pad(rsa,", 20) == 0) return -EINVAL; return 0; } |
2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 | // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Pieter Beyens <pieter.beyens@eia.be> // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2018 Protonic, // Robin van der Gracht <robin@protonic.nl> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/can/can-ml.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/errqueue.h> #include <linux/if_arp.h> #include "j1939-priv.h" #define J1939_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.j1939) /* conversion function between struct sock::sk_priority from linux and * j1939 priority field */ static inline priority_t j1939_prio(u32 sk_priority) { sk_priority = min(sk_priority, 7U); return 7 - sk_priority; } static inline u32 j1939_to_sk_priority(priority_t prio) { return 7 - prio; } /* function to see if pgn is to be evaluated */ static inline bool j1939_pgn_is_valid(pgn_t pgn) { return pgn <= J1939_PGN_MAX; } /* test function to avoid non-zero DA placeholder for pdu1 pgn's */ static inline bool j1939_pgn_is_clean_pdu(pgn_t pgn) { if (j1939_pgn_is_pdu1(pgn)) return !(pgn & 0xff); else return true; } static inline void j1939_sock_pending_add(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); atomic_inc(&jsk->skb_pending); } static int j1939_sock_pending_get(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); return atomic_read(&jsk->skb_pending); } void j1939_sock_pending_del(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* atomic_dec_return returns the new value */ if (!atomic_dec_return(&jsk->skb_pending)) wake_up(&jsk->waitq); /* no pending SKB's */ } static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk) { jsk->state |= J1939_SOCK_BOUND; j1939_priv_get(priv); spin_lock_bh(&priv->j1939_socks_lock); list_add_tail(&jsk->list, &priv->j1939_socks); spin_unlock_bh(&priv->j1939_socks_lock); } static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk) { spin_lock_bh(&priv->j1939_socks_lock); list_del_init(&jsk->list); spin_unlock_bh(&priv->j1939_socks_lock); j1939_priv_put(priv); jsk->state &= ~J1939_SOCK_BOUND; } static bool j1939_sk_queue_session(struct j1939_session *session) { struct j1939_sock *jsk = j1939_sk(session->sk); bool empty; spin_lock_bh(&jsk->sk_session_queue_lock); empty = list_empty(&jsk->sk_session_queue); j1939_session_get(session); list_add_tail(&session->sk_session_queue_entry, &jsk->sk_session_queue); spin_unlock_bh(&jsk->sk_session_queue_lock); j1939_sock_pending_add(&jsk->sk); return empty; } static struct j1939_session *j1939_sk_get_incomplete_session(struct j1939_sock *jsk) { struct j1939_session *session = NULL; spin_lock_bh(&jsk->sk_session_queue_lock); if (!list_empty(&jsk->sk_session_queue)) { session = list_last_entry(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (session->total_queued_size == session->total_message_size) session = NULL; else j1939_session_get(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); return session; } static void j1939_sk_queue_drop_all(struct j1939_priv *priv, struct j1939_sock *jsk, int err) { struct j1939_session *session, *tmp; netdev_dbg(priv->ndev, "%s: err: %i\n", __func__, err); spin_lock_bh(&jsk->sk_session_queue_lock); list_for_each_entry_safe(session, tmp, &jsk->sk_session_queue, sk_session_queue_entry) { list_del_init(&session->sk_session_queue_entry); session->err = err; j1939_session_put(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); } static void j1939_sk_queue_activate_next_locked(struct j1939_session *session) { struct j1939_sock *jsk; struct j1939_session *first; int err; /* RX-Session don't have a socket (yet) */ if (!session->sk) return; jsk = j1939_sk(session->sk); lockdep_assert_held(&jsk->sk_session_queue_lock); err = session->err; first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); /* Some else has already activated the next session */ if (first != session) return; activate_next: list_del_init(&first->sk_session_queue_entry); j1939_session_put(first); first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (!first) return; if (j1939_session_activate(first)) { netdev_warn_once(first->priv->ndev, "%s: 0x%p: Identical session is already activated.\n", __func__, first); first->err = -EBUSY; goto activate_next; } else { /* Give receiver some time (arbitrary chosen) to recover */ int time_ms = 0; if (err) time_ms = 10 + get_random_u32_below(16); j1939_tp_schedule_txtimer(first, time_ms); } } void j1939_sk_queue_activate_next(struct j1939_session *session) { struct j1939_sock *jsk; if (!session->sk) return; jsk = j1939_sk(session->sk); spin_lock_bh(&jsk->sk_session_queue_lock); j1939_sk_queue_activate_next_locked(session); spin_unlock_bh(&jsk->sk_session_queue_lock); } static bool j1939_sk_match_dst(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if ((jsk->state & J1939_SOCK_PROMISC)) return true; /* Destination address filter */ if (jsk->addr.src_name && skcb->addr.dst_name) { if (jsk->addr.src_name != skcb->addr.dst_name) return false; } else { /* receive (all sockets) if * - all packages that match our bind() address * - all broadcast on a socket if SO_BROADCAST * is set */ if (j1939_address_is_unicast(skcb->addr.da)) { if (jsk->addr.sa != skcb->addr.da) return false; } else if (!sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* receiving broadcast without SO_BROADCAST * flag is not allowed */ return false; } } /* Source address filter */ if (jsk->state & J1939_SOCK_CONNECTED) { /* receive (all sockets) if * - all packages that match our connect() name or address */ if (jsk->addr.dst_name && skcb->addr.src_name) { if (jsk->addr.dst_name != skcb->addr.src_name) return false; } else { if (jsk->addr.da != skcb->addr.sa) return false; } } /* PGN filter */ if (j1939_pgn_is_valid(jsk->pgn_rx_filter) && jsk->pgn_rx_filter != skcb->addr.pgn) return false; return true; } /* matches skb control buffer (addr) with a j1939 filter */ static bool j1939_sk_match_filter(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { const struct j1939_filter *f = jsk->filters; int nfilter = jsk->nfilters; if (!nfilter) /* receive all when no filters are assigned */ return true; for (; nfilter; ++f, --nfilter) { if ((skcb->addr.pgn & f->pgn_mask) != f->pgn) continue; if ((skcb->addr.sa & f->addr_mask) != f->addr) continue; if ((skcb->addr.src_name & f->name_mask) != f->name) continue; return true; } return false; } static bool j1939_sk_recv_match_one(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if (!(jsk->state & J1939_SOCK_BOUND)) return false; if (!j1939_sk_match_dst(jsk, skcb)) return false; if (!j1939_sk_match_filter(jsk, skcb)) return false; return true; } static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb) { const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb); struct j1939_sk_buff_cb *skcb; struct sk_buff *skb; if (oskb->sk == &jsk->sk) return; if (!j1939_sk_recv_match_one(jsk, oskcb)) return; skb = skb_clone(oskb, GFP_ATOMIC); if (!skb) { pr_warn("skb clone failed\n"); return; } can_skb_set_owner(skb, oskb->sk); skcb = j1939_skb_to_cb(skb); skcb->msg_flags &= ~(MSG_DONTROUTE); if (skb->sk) skcb->msg_flags |= MSG_DONTROUTE; if (sock_queue_rcv_skb(&jsk->sk, skb) < 0) kfree_skb(skb); } bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb) { struct j1939_sock *jsk; bool match = false; spin_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { match = j1939_sk_recv_match_one(jsk, skcb); if (match) break; } spin_unlock_bh(&priv->j1939_socks_lock); return match; } void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sock *jsk; spin_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { j1939_sk_recv_one(jsk, skb); } spin_unlock_bh(&priv->j1939_socks_lock); } static void j1939_sk_sock_destruct(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* This function will be called by the generic networking code, when * the socket is ultimately closed (sk->sk_destruct). * * The race between * - processing a received CAN frame * (can_receive -> j1939_can_recv) * and accessing j1939_priv * ... and ... * - closing a socket * (j1939_can_rx_unregister -> can_rx_unregister) * and calling the final j1939_priv_put() * * is avoided by calling the final j1939_priv_put() from this * RCU deferred cleanup call. */ if (jsk->priv) { j1939_priv_put(jsk->priv); jsk->priv = NULL; } /* call generic CAN sock destruct */ can_sock_destruct(sk); } static int j1939_sk_init(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* Ensure that "sk" is first member in "struct j1939_sock", so that we * can skip it during memset(). */ BUILD_BUG_ON(offsetof(struct j1939_sock, sk) != 0); memset((void *)jsk + sizeof(jsk->sk), 0x0, sizeof(*jsk) - sizeof(jsk->sk)); INIT_LIST_HEAD(&jsk->list); init_waitqueue_head(&jsk->waitq); jsk->sk.sk_priority = j1939_to_sk_priority(6); jsk->sk.sk_reuse = 1; /* per default */ jsk->addr.sa = J1939_NO_ADDR; jsk->addr.da = J1939_NO_ADDR; jsk->addr.pgn = J1939_NO_PGN; jsk->pgn_rx_filter = J1939_NO_PGN; atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */ sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_destruct = j1939_sk_sock_destruct; sk->sk_protocol = CAN_J1939; return 0; } static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len) { if (!addr) return -EDESTADDRREQ; if (len < J1939_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; if (!addr->can_ifindex) return -ENODEV; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) return -EINVAL; return 0; } static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); struct j1939_priv *priv; struct sock *sk; struct net *net; int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); priv = jsk->priv; sk = sock->sk; net = sock_net(sk); /* Already bound to an interface? */ if (jsk->state & J1939_SOCK_BOUND) { /* A re-bind() to a different interface is not * supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } /* drop old references */ j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); } else { struct can_ml_priv *can_ml; struct net_device *ndev; ndev = dev_get_by_index(net, addr->can_ifindex); if (!ndev) { ret = -ENODEV; goto out_release_sock; } can_ml = can_get_ml_priv(ndev); if (!can_ml) { dev_put(ndev); ret = -ENODEV; goto out_release_sock; } if (!(ndev->flags & IFF_UP)) { dev_put(ndev); ret = -ENETDOWN; goto out_release_sock; } priv = j1939_netdev_start(ndev); dev_put(ndev); if (IS_ERR(priv)) { ret = PTR_ERR(priv); goto out_release_sock; } jsk->ifindex = addr->can_ifindex; /* the corresponding j1939_priv_put() is called via * sk->sk_destruct, which points to j1939_sk_sock_destruct() */ j1939_priv_get(priv); jsk->priv = priv; } /* set default transmit pgn */ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->pgn_rx_filter = addr->can_addr.j1939.pgn; jsk->addr.src_name = addr->can_addr.j1939.name; jsk->addr.sa = addr->can_addr.j1939.addr; /* get new references */ ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); goto out_release_sock; } j1939_jsk_add(priv, jsk); out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static int j1939_sk_connect(struct socket *sock, struct sockaddr *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); /* bind() before connect() is mandatory */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EINVAL; goto out_release_sock; } /* A connect() to a different interface is not supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto out_release_sock; } jsk->addr.dst_name = addr->can_addr.j1939.name; jsk->addr.da = addr->can_addr.j1939.addr; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->addr.pgn = addr->can_addr.j1939.pgn; jsk->state |= J1939_SOCK_CONNECTED; out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr, const struct j1939_sock *jsk, int peer) { /* There are two holes (2 bytes and 3 bytes) to clear to avoid * leaking kernel information to user space. */ memset(addr, 0, J1939_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = jsk->ifindex; addr->can_addr.j1939.pgn = jsk->addr.pgn; if (peer) { addr->can_addr.j1939.name = jsk->addr.dst_name; addr->can_addr.j1939.addr = jsk->addr.da; } else { addr->can_addr.j1939.name = jsk->addr.src_name; addr->can_addr.j1939.addr = jsk->addr.sa; } } static int j1939_sk_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret = 0; lock_sock(sk); if (peer && !(jsk->state & J1939_SOCK_CONNECTED)) { ret = -EADDRNOTAVAIL; goto failure; } j1939_sk_sock2sockaddr_can(addr, jsk, peer); ret = J1939_MIN_NAMELEN; failure: release_sock(sk); return ret; } static int j1939_sk_release(struct socket *sock) { struct sock *sk = sock->sk; struct j1939_sock *jsk; if (!sk) return 0; lock_sock(sk); jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_BOUND) { struct j1939_priv *priv = jsk->priv; if (wait_event_interruptible(jsk->waitq, !j1939_sock_pending_get(&jsk->sk))) { j1939_cancel_active_session(priv, sk); j1939_sk_queue_drop_all(priv, jsk, ESHUTDOWN); } j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); j1939_netdev_stop(priv); } kfree(jsk->filters); sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, sockptr_t optval, unsigned int optlen, int flag) { int tmp; if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; lock_sock(&jsk->sk); if (tmp) jsk->state |= flag; else jsk->state &= ~flag; release_sock(&jsk->sk); return tmp; } static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int tmp, count = 0, ret = 0; struct j1939_filter *filters = NULL, *ofilters; if (level != SOL_CAN_J1939) return -EINVAL; switch (optname) { case SO_J1939_FILTER: if (!sockptr_is_null(optval) && optlen != 0) { struct j1939_filter *f; int c; if (optlen % sizeof(*filters) != 0) return -EINVAL; if (optlen > J1939_FILTER_MAX * sizeof(struct j1939_filter)) return -EINVAL; count = optlen / sizeof(*filters); filters = memdup_sockptr(optval, optlen); if (IS_ERR(filters)) return PTR_ERR(filters); for (f = filters, c = count; c; f++, c--) { f->name &= f->name_mask; f->pgn &= f->pgn_mask; f->addr &= f->addr_mask; } } lock_sock(&jsk->sk); ofilters = jsk->filters; jsk->filters = filters; jsk->nfilters = count; release_sock(&jsk->sk); kfree(ofilters); return 0; case SO_J1939_PROMISC: return j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_PROMISC); case SO_J1939_ERRQUEUE: ret = j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_ERRQUEUE); if (ret < 0) return ret; if (!(jsk->state & J1939_SOCK_ERRQUEUE)) skb_queue_purge(&sk->sk_error_queue); return ret; case SO_J1939_SEND_PRIO: if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; if (tmp < 0 || tmp > 7) return -EDOM; if (tmp < 2 && !capable(CAP_NET_ADMIN)) return -EPERM; lock_sock(&jsk->sk); jsk->sk.sk_priority = j1939_to_sk_priority(tmp); release_sock(&jsk->sk); return 0; default: return -ENOPROTOOPT; } } static int j1939_sk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret, ulen; /* set defaults for using 'int' properties */ int tmp = 0; int len = sizeof(tmp); void *val = &tmp; if (level != SOL_CAN_J1939) return -EINVAL; if (get_user(ulen, optlen)) return -EFAULT; if (ulen < 0) return -EINVAL; lock_sock(&jsk->sk); switch (optname) { case SO_J1939_PROMISC: tmp = (jsk->state & J1939_SOCK_PROMISC) ? 1 : 0; break; case SO_J1939_ERRQUEUE: tmp = (jsk->state & J1939_SOCK_ERRQUEUE) ? 1 : 0; break; case SO_J1939_SEND_PRIO: tmp = j1939_prio(jsk->sk.sk_priority); break; default: ret = -ENOPROTOOPT; goto no_copy; } /* copy to user, based on 'len' & 'val' * but most sockopt's are 'int' properties, and have 'len' & 'val' * left unchanged, but instead modified 'tmp' */ if (len > ulen) ret = -EFAULT; else if (put_user(len, optlen)) ret = -EFAULT; else if (copy_to_user(optval, val, len)) ret = -EFAULT; else ret = 0; no_copy: release_sock(&jsk->sk); return ret; } static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; struct j1939_sk_buff_cb *skcb; int ret = 0; if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; if (flags & MSG_ERRQUEUE) return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939, SCM_J1939_ERRQUEUE); skb = skb_recv_datagram(sk, flags, &ret); if (!skb) return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; ret = memcpy_to_msg(msg, skb->data, size); if (ret < 0) { skb_free_datagram(sk, skb); return ret; } skcb = j1939_skb_to_cb(skb); if (j1939_address_is_valid(skcb->addr.da)) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_ADDR, sizeof(skcb->addr.da), &skcb->addr.da); if (skcb->addr.dst_name) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_NAME, sizeof(skcb->addr.dst_name), &skcb->addr.dst_name); put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_PRIO, sizeof(skcb->priority), &skcb->priority); if (msg->msg_name) { struct sockaddr_can *paddr = msg->msg_name; msg->msg_namelen = J1939_MIN_NAMELEN; memset(msg->msg_name, 0, msg->msg_namelen); paddr->can_family = AF_CAN; paddr->can_ifindex = skb->skb_iif; paddr->can_addr.j1939.name = skcb->addr.src_name; paddr->can_addr.j1939.addr = skcb->addr.sa; paddr->can_addr.j1939.pgn = skcb->addr.pgn; } sock_recv_cmsgs(msg, sk, skb); msg->msg_flags |= skcb->msg_flags; skb_free_datagram(sk, skb); return size; } static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev, struct sock *sk, struct msghdr *msg, size_t size, int *errcode) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_sk_buff_cb *skcb; struct sk_buff *skb; int ret; skb = sock_alloc_send_skb(sk, size + sizeof(struct can_frame) - sizeof(((struct can_frame *)NULL)->data) + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &ret); if (!skb) goto failure; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = ndev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb_reserve(skb, offsetof(struct can_frame, data)); ret = memcpy_from_msg(skb_put(skb, size), msg, size); if (ret < 0) goto free_skb; skb->dev = ndev; skcb = j1939_skb_to_cb(skb); memset(skcb, 0, sizeof(*skcb)); skcb->addr = jsk->addr; skcb->priority = j1939_prio(sk->sk_priority); if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (addr->can_addr.j1939.name || addr->can_addr.j1939.addr != J1939_NO_ADDR) { skcb->addr.dst_name = addr->can_addr.j1939.name; skcb->addr.da = addr->can_addr.j1939.addr; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) skcb->addr.pgn = addr->can_addr.j1939.pgn; } *errcode = ret; return skb; free_skb: kfree_skb(skb); failure: *errcode = ret; return NULL; } static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type) { switch (type) { case J1939_ERRQUEUE_RX_RTS: return nla_total_size(sizeof(u32)) + /* J1939_NLA_TOTAL_SIZE */ nla_total_size(sizeof(u32)) + /* J1939_NLA_PGN */ nla_total_size(sizeof(u64)) + /* J1939_NLA_SRC_NAME */ nla_total_size(sizeof(u64)) + /* J1939_NLA_DEST_NAME */ nla_total_size(sizeof(u8)) + /* J1939_NLA_SRC_ADDR */ nla_total_size(sizeof(u8)) + /* J1939_NLA_DEST_ADDR */ 0; default: return nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ 0; } } static struct sk_buff * j1939_sk_get_timestamping_opt_stats(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct sk_buff *stats; u32 size; stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC); if (!stats) return NULL; if (session->skcb.addr.type == J1939_SIMPLE) size = session->total_message_size; else size = min(session->pkt.tx_acked * 7, session->total_message_size); switch (type) { case J1939_ERRQUEUE_RX_RTS: nla_put_u32(stats, J1939_NLA_TOTAL_SIZE, session->total_message_size); nla_put_u32(stats, J1939_NLA_PGN, session->skcb.addr.pgn); nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME, session->skcb.addr.src_name, J1939_NLA_PAD); nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME, session->skcb.addr.dst_name, J1939_NLA_PAD); nla_put_u8(stats, J1939_NLA_SRC_ADDR, session->skcb.addr.sa); nla_put_u8(stats, J1939_NLA_DEST_ADDR, session->skcb.addr.da); break; default: nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); } return stats; } static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; u32 tsflags; int err; jsk = j1939_sk(sk); if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; tsflags = READ_ONCE(sk->sk_tsflags); switch (type) { case J1939_ERRQUEUE_TX_ACK: if (!(tsflags & SOF_TIMESTAMPING_TX_ACK)) return; break; case J1939_ERRQUEUE_TX_SCHED: if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED)) return; break; case J1939_ERRQUEUE_TX_ABORT: break; case J1939_ERRQUEUE_RX_RTS: fallthrough; case J1939_ERRQUEUE_RX_DPO: fallthrough; case J1939_ERRQUEUE_RX_ABORT: if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) return; break; default: netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); } skb = j1939_sk_get_timestamping_opt_stats(session, type); if (!skb) return; skb->tstamp = ktime_get_real(); BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); switch (type) { case J1939_ERRQUEUE_TX_ACK: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_ACK; state = "TX ACK"; break; case J1939_ERRQUEUE_TX_SCHED: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SCHED; state = "TX SCH"; break; case J1939_ERRQUEUE_TX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_TX_ABORT; state = "TX ABT"; break; case J1939_ERRQUEUE_RX_RTS: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_RTS; state = "RX RTS"; break; case J1939_ERRQUEUE_RX_DPO: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_DPO; state = "RX DPO"; break; case J1939_ERRQUEUE_RX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_ABORT; state = "RX ABT"; break; } serr->opt_stats = true; if (tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = session->tskey; netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", __func__, session, session->tskey, state); err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); }; void j1939_sk_errqueue(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; if (session->sk) { /* send TX notifications to the socket of origin */ __j1939_sk_errqueue(session, session->sk, type); return; } /* spread RX notifications to all sockets subscribed to this session */ spin_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { if (j1939_sk_recv_match_one(jsk, &session->skcb)) __j1939_sk_errqueue(session, &jsk->sk, type); } spin_unlock_bh(&priv->j1939_socks_lock); }; void j1939_sk_send_loop_abort(struct sock *sk, int err) { struct j1939_sock *jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_ERRQUEUE) return; sk->sk_err = err; sk_error_report(sk); } static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, struct msghdr *msg, size_t size) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_session *session = j1939_sk_get_incomplete_session(jsk); struct sk_buff *skb; size_t segment_size, todo_size; int ret = 0; if (session && session->total_message_size != session->total_queued_size + size) { j1939_session_put(session); return -EIO; } todo_size = size; while (todo_size) { struct j1939_sk_buff_cb *skcb; segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, todo_size); /* Allocate skb for one segment */ skb = j1939_sk_alloc_skb(priv->ndev, sk, msg, segment_size, &ret); if (ret) break; skcb = j1939_skb_to_cb(skb); if (!session) { /* at this point the size should be full size * of the session */ skcb->offset = 0; session = j1939_tp_send(priv, skb, size); if (IS_ERR(session)) { ret = PTR_ERR(session); goto kfree_skb; } if (j1939_sk_queue_session(session)) { /* try to activate session if we a * fist in the queue */ if (!j1939_session_activate(session)) { j1939_tp_schedule_txtimer(session, 0); } else { ret = -EBUSY; session->err = ret; j1939_sk_queue_drop_all(priv, jsk, EBUSY); break; } } } else { skcb->offset = session->total_queued_size; j1939_session_skb_queue(session, skb); } todo_size -= segment_size; session->total_queued_size += segment_size; } switch (ret) { case 0: /* OK */ if (todo_size) netdev_warn(priv->ndev, "no error found and not completely queued?! %zu\n", todo_size); ret = size; break; case -ERESTARTSYS: ret = -EINTR; fallthrough; case -EAGAIN: /* OK */ if (todo_size != size) ret = size - todo_size; break; default: /* ERROR */ break; } if (session) j1939_session_put(session); return ret; kfree_skb: kfree_skb(skb); return ret; } static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); struct j1939_priv *priv; int ifindex; int ret; lock_sock(sock->sk); /* various socket state tests */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EBADFD; goto sendmsg_done; } priv = jsk->priv; ifindex = jsk->ifindex; if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) { /* no source address assigned yet */ ret = -EBADFD; goto sendmsg_done; } /* deal with provided destination address info */ if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (msg->msg_namelen < J1939_MIN_NAMELEN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_family != AF_CAN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_ifindex && addr->can_ifindex != ifindex) { ret = -EBADFD; goto sendmsg_done; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) { ret = -EINVAL; goto sendmsg_done; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } else { if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } ret = j1939_sk_send_loop(priv, sk, msg, size); sendmsg_done: release_sock(sock->sk); return ret; } void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) { struct j1939_sock *jsk; int error_code = ENETDOWN; spin_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { jsk->sk.sk_err = error_code; if (!sock_flag(&jsk->sk, SOCK_DEAD)) sk_error_report(&jsk->sk); j1939_sk_queue_drop_all(priv, jsk, error_code); } spin_unlock_bh(&priv->j1939_socks_lock); } static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops j1939_ops = { .family = PF_CAN, .release = j1939_sk_release, .bind = j1939_sk_bind, .connect = j1939_sk_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = j1939_sk_getname, .poll = datagram_poll, .ioctl = j1939_sk_no_ioctlcmd, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = j1939_sk_setsockopt, .getsockopt = j1939_sk_getsockopt, .sendmsg = j1939_sk_sendmsg, .recvmsg = j1939_sk_recvmsg, .mmap = sock_no_mmap, }; static struct proto j1939_proto __read_mostly = { .name = "CAN_J1939", .owner = THIS_MODULE, .obj_size = sizeof(struct j1939_sock), .init = j1939_sk_init, }; const struct can_proto j1939_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_J1939, .ops = &j1939_ops, .prot = &j1939_proto, }; |
1262 1900 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 | /* SPDX-License-Identifier: GPL-2.0 OR MIT */ #ifndef __LINUX_OVERFLOW_H #define __LINUX_OVERFLOW_H #include <linux/compiler.h> #include <linux/limits.h> #include <linux/const.h> /* * We need to compute the minimum and maximum values representable in a given * type. These macros may also be useful elsewhere. It would seem more obvious * to do something like: * * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) * * Unfortunately, the middle expressions, strictly speaking, have * undefined behaviour, and at least some versions of gcc warn about * the type_max expression (but not if -fsanitize=undefined is in * effect; in that case, the warning is deferred to runtime...). * * The slightly excessive casting in type_min is to make sure the * macros also produce sensible values for the exotic type _Bool. [The * overflow checkers only almost work for _Bool, but that's * a-feature-not-a-bug, since people shouldn't be doing arithmetic on * _Bools. Besides, the gcc builtins don't allow _Bool* as third * argument.] * * Idea stolen from * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - * credit to Christian Biere. */ #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) #define type_min(T) ((T)((T)-type_max(T)-(T)1)) /* * Avoids triggering -Wtype-limits compilation warning, * while using unsigned data types to check a < 0. */ #define is_non_negative(a) ((a) > 0 || (a) == 0) #define is_negative(a) (!(is_non_negative(a))) /* * Allows for effectively applying __must_check to a macro so we can have * both the type-agnostic benefits of the macros while also being able to * enforce that the return value is, in fact, checked. */ static inline bool __must_check __must_check_overflow(bool overflow) { return unlikely(overflow); } /** * check_add_overflow() - Calculate addition with overflow checking * @a: first addend * @b: second addend * @d: pointer to store sum * * Returns 0 on success. * * *@d holds the results of the attempted addition, but is not considered * "safe for use" on a non-zero return value, which indicates that the * sum has overflowed or been truncated. */ #define check_add_overflow(a, b, d) \ __must_check_overflow(__builtin_add_overflow(a, b, d)) /** * check_sub_overflow() - Calculate subtraction with overflow checking * @a: minuend; value to subtract from * @b: subtrahend; value to subtract from @a * @d: pointer to store difference * * Returns 0 on success. * * *@d holds the results of the attempted subtraction, but is not considered * "safe for use" on a non-zero return value, which indicates that the * difference has underflowed or been truncated. */ #define check_sub_overflow(a, b, d) \ __must_check_overflow(__builtin_sub_overflow(a, b, d)) /** * check_mul_overflow() - Calculate multiplication with overflow checking * @a: first factor * @b: second factor * @d: pointer to store product * * Returns 0 on success. * * *@d holds the results of the attempted multiplication, but is not * considered "safe for use" on a non-zero return value, which indicates * that the product has overflowed or been truncated. */ #define check_mul_overflow(a, b, d) \ __must_check_overflow(__builtin_mul_overflow(a, b, d)) /** * check_shl_overflow() - Calculate a left-shifted value and check overflow * @a: Value to be shifted * @s: How many bits left to shift * @d: Pointer to where to store the result * * Computes *@d = (@a << @s) * * Returns true if '*@d' cannot hold the result or when '@a << @s' doesn't * make sense. Example conditions: * * - '@a << @s' causes bits to be lost when stored in *@d. * - '@s' is garbage (e.g. negative) or so large that the result of * '@a << @s' is guaranteed to be 0. * - '@a' is negative. * - '@a << @s' sets the sign bit, if any, in '*@d'. * * '*@d' will hold the results of the attempted shift, but is not * considered "safe for use" if true is returned. */ #define check_shl_overflow(a, s, d) __must_check_overflow(({ \ typeof(a) _a = a; \ typeof(s) _s = s; \ typeof(d) _d = d; \ u64 _a_full = _a; \ unsigned int _to_shift = \ is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \ *_d = (_a_full << _to_shift); \ (_to_shift != _s || is_negative(*_d) || is_negative(_a) || \ (*_d >> _to_shift) != _a); \ })) #define __overflows_type_constexpr(x, T) ( \ is_unsigned_type(typeof(x)) ? \ (x) > type_max(typeof(T)) : \ is_unsigned_type(typeof(T)) ? \ (x) < 0 || (x) > type_max(typeof(T)) : \ (x) < type_min(typeof(T)) || (x) > type_max(typeof(T))) #define __overflows_type(x, T) ({ \ typeof(T) v = 0; \ check_add_overflow((x), v, &v); \ }) /** * overflows_type - helper for checking the overflows between value, variables, * or data type * * @n: source constant value or variable to be checked * @T: destination variable or data type proposed to store @x * * Compares the @x expression for whether or not it can safely fit in * the storage of the type in @T. @x and @T can have different types. * If @x is a constant expression, this will also resolve to a constant * expression. * * Returns: true if overflow can occur, false otherwise. */ #define overflows_type(n, T) \ __builtin_choose_expr(__is_constexpr(n), \ __overflows_type_constexpr(n, T), \ __overflows_type(n, T)) /** * castable_to_type - like __same_type(), but also allows for casted literals * * @n: variable or constant value * @T: variable or data type * * Unlike the __same_type() macro, this allows a constant value as the * first argument. If this value would not overflow into an assignment * of the second argument's type, it returns true. Otherwise, this falls * back to __same_type(). */ #define castable_to_type(n, T) \ __builtin_choose_expr(__is_constexpr(n), \ !__overflows_type_constexpr(n, T), \ __same_type(n, T)) /** * size_mul() - Calculate size_t multiplication with saturation at SIZE_MAX * @factor1: first factor * @factor2: second factor * * Returns: calculate @factor1 * @factor2, both promoted to size_t, * with any overflow causing the return value to be SIZE_MAX. The * lvalue must be size_t to avoid implicit type conversion. */ static inline size_t __must_check size_mul(size_t factor1, size_t factor2) { size_t bytes; if (check_mul_overflow(factor1, factor2, &bytes)) return SIZE_MAX; return bytes; } /** * size_add() - Calculate size_t addition with saturation at SIZE_MAX * @addend1: first addend * @addend2: second addend * * Returns: calculate @addend1 + @addend2, both promoted to size_t, * with any overflow causing the return value to be SIZE_MAX. The * lvalue must be size_t to avoid implicit type conversion. */ static inline size_t __must_check size_add(size_t addend1, size_t addend2) { size_t bytes; if (check_add_overflow(addend1, addend2, &bytes)) return SIZE_MAX; return bytes; } /** * size_sub() - Calculate size_t subtraction with saturation at SIZE_MAX * @minuend: value to subtract from * @subtrahend: value to subtract from @minuend * * Returns: calculate @minuend - @subtrahend, both promoted to size_t, * with any overflow causing the return value to be SIZE_MAX. For * composition with the size_add() and size_mul() helpers, neither * argument may be SIZE_MAX (or the result with be forced to SIZE_MAX). * The lvalue must be size_t to avoid implicit type conversion. */ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) { size_t bytes; if (minuend == SIZE_MAX || subtrahend == SIZE_MAX || check_sub_overflow(minuend, subtrahend, &bytes)) return SIZE_MAX; return bytes; } /** * array_size() - Calculate size of 2-dimensional array. * @a: dimension one * @b: dimension two * * Calculates size of 2-dimensional array: @a * @b. * * Returns: number of bytes needed to represent the array or SIZE_MAX on * overflow. */ #define array_size(a, b) size_mul(a, b) /** * array3_size() - Calculate size of 3-dimensional array. * @a: dimension one * @b: dimension two * @c: dimension three * * Calculates size of 3-dimensional array: @a * @b * @c. * * Returns: number of bytes needed to represent the array or SIZE_MAX on * overflow. */ #define array3_size(a, b, c) size_mul(size_mul(a, b), c) /** * flex_array_size() - Calculate size of a flexible array member * within an enclosing structure. * @p: Pointer to the structure. * @member: Name of the flexible array member. * @count: Number of elements in the array. * * Calculates size of a flexible array of @count number of @member * elements, at the end of structure @p. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define flex_array_size(p, member, count) \ __builtin_choose_expr(__is_constexpr(count), \ (count) * sizeof(*(p)->member) + __must_be_array((p)->member), \ size_mul(count, sizeof(*(p)->member) + __must_be_array((p)->member))) /** * struct_size() - Calculate size of structure with trailing flexible array. * @p: Pointer to the structure. * @member: Name of the array member. * @count: Number of elements in the array. * * Calculates size of memory needed for structure of @p followed by an * array of @count number of @member elements. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define struct_size(p, member, count) \ __builtin_choose_expr(__is_constexpr(count), \ sizeof(*(p)) + flex_array_size(p, member, count), \ size_add(sizeof(*(p)), flex_array_size(p, member, count))) /** * struct_size_t() - Calculate size of structure with trailing flexible array * @type: structure type name. * @member: Name of the array member. * @count: Number of elements in the array. * * Calculates size of memory needed for structure @type followed by an * array of @count number of @member elements. Prefer using struct_size() * when possible instead, to keep calculations associated with a specific * instance variable of type @type. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define struct_size_t(type, member, count) \ struct_size((type *)NULL, member, count) #endif /* __LINUX_OVERFLOW_H */ |
1 1 1 1 1 1 1 1 1 1 1 1 20 20 1615 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 | // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * This file contains device methods for creating, using and destroying * virtual HSR or PRP devices. */ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/pkt_sched.h> #include "hsr_device.h" #include "hsr_slave.h" #include "hsr_framereg.h" #include "hsr_main.h" #include "hsr_forward.h" static bool is_admin_up(struct net_device *dev) { return dev && (dev->flags & IFF_UP); } static bool is_slave_up(struct net_device *dev) { return dev && is_admin_up(dev) && netif_oper_up(dev); } static void __hsr_set_operstate(struct net_device *dev, int transition) { write_lock(&dev_base_lock); if (dev->operstate != transition) { dev->operstate = transition; write_unlock(&dev_base_lock); netdev_state_change(dev); } else { write_unlock(&dev_base_lock); } } static void hsr_set_operstate(struct hsr_port *master, bool has_carrier) { if (!is_admin_up(master->dev)) { __hsr_set_operstate(master->dev, IF_OPER_DOWN); return; } if (has_carrier) __hsr_set_operstate(master->dev, IF_OPER_UP); else __hsr_set_operstate(master->dev, IF_OPER_LOWERLAYERDOWN); } static bool hsr_check_carrier(struct hsr_port *master) { struct hsr_port *port; ASSERT_RTNL(); hsr_for_each_port(master->hsr, port) { if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) { netif_carrier_on(master->dev); return true; } } netif_carrier_off(master->dev); return false; } static void hsr_check_announce(struct net_device *hsr_dev, unsigned char old_operstate) { struct hsr_priv *hsr; hsr = netdev_priv(hsr_dev); if (hsr_dev->operstate == IF_OPER_UP && old_operstate != IF_OPER_UP) { /* Went up */ hsr->announce_count = 0; mod_timer(&hsr->announce_timer, jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL)); } if (hsr_dev->operstate != IF_OPER_UP && old_operstate == IF_OPER_UP) /* Went down */ del_timer(&hsr->announce_timer); } void hsr_check_carrier_and_operstate(struct hsr_priv *hsr) { struct hsr_port *master; unsigned char old_operstate; bool has_carrier; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); /* netif_stacked_transfer_operstate() cannot be used here since * it doesn't set IF_OPER_LOWERLAYERDOWN (?) */ old_operstate = master->dev->operstate; has_carrier = hsr_check_carrier(master); hsr_set_operstate(master, has_carrier); hsr_check_announce(master->dev, old_operstate); } int hsr_get_max_mtu(struct hsr_priv *hsr) { unsigned int mtu_max; struct hsr_port *port; mtu_max = ETH_DATA_LEN; hsr_for_each_port(hsr, port) if (port->type != HSR_PT_MASTER) mtu_max = min(port->dev->mtu, mtu_max); if (mtu_max < HSR_HLEN) return 0; return mtu_max - HSR_HLEN; } static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) { struct hsr_priv *hsr; hsr = netdev_priv(dev); if (new_mtu > hsr_get_max_mtu(hsr)) { netdev_info(dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", HSR_HLEN); return -EINVAL; } dev->mtu = new_mtu; return 0; } static int hsr_dev_open(struct net_device *dev) { struct hsr_priv *hsr; struct hsr_port *port; char designation; hsr = netdev_priv(dev); designation = '\0'; hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { case HSR_PT_SLAVE_A: designation = 'A'; break; case HSR_PT_SLAVE_B: designation = 'B'; break; default: designation = '?'; } if (!is_slave_up(port->dev)) netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a fully working HSR network\n", designation, port->dev->name); } if (designation == '\0') netdev_warn(dev, "No slave devices configured\n"); return 0; } static int hsr_dev_close(struct net_device *dev) { /* Nothing to do here. */ return 0; } static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr, netdev_features_t features) { netdev_features_t mask; struct hsr_port *port; mask = features; /* Mask out all features that, if supported by one device, should be * enabled for all devices (see NETIF_F_ONE_FOR_ALL). * * Anything that's off in mask will not be enabled - so only things * that were in features originally, and also is in NETIF_F_ONE_FOR_ALL, * may become enabled. */ features &= ~NETIF_F_ONE_FOR_ALL; hsr_for_each_port(hsr, port) features = netdev_increment_features(features, port->dev->features, mask); return features; } static netdev_features_t hsr_fix_features(struct net_device *dev, netdev_features_t features) { struct hsr_priv *hsr = netdev_priv(dev); return hsr_features_recompute(hsr, features); } static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct hsr_priv *hsr = netdev_priv(dev); struct hsr_port *master; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (master) { skb->dev = master->dev; skb_reset_mac_header(skb); skb_reset_mac_len(skb); spin_lock_bh(&hsr->seqnr_lock); hsr_forward_skb(skb, master); spin_unlock_bh(&hsr->seqnr_lock); } else { dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); } return NETDEV_TX_OK; } static const struct header_ops hsr_header_ops = { .create = eth_header, .parse = eth_header_parse, }; static struct sk_buff *hsr_init_skb(struct hsr_port *master) { struct hsr_priv *hsr = master->hsr; struct sk_buff *skb; int hlen, tlen; hlen = LL_RESERVED_SPACE(master->dev); tlen = master->dev->needed_tailroom; /* skb size is same for PRP/HSR frames, only difference * being, for PRP it is a trailer and for HSR it is a * header */ skb = dev_alloc_skb(sizeof(struct hsr_sup_tag) + sizeof(struct hsr_sup_payload) + hlen + tlen); if (!skb) return skb; skb_reserve(skb, hlen); skb->dev = master->dev; skb->priority = TC_PRIO_CONTROL; if (dev_hard_header(skb, skb->dev, ETH_P_PRP, hsr->sup_multicast_addr, skb->dev->dev_addr, skb->len) <= 0) goto out; skb_reset_mac_header(skb); skb_reset_mac_len(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); return skb; out: kfree_skb(skb); return NULL; } static void send_hsr_supervision_frame(struct hsr_port *master, unsigned long *interval) { struct hsr_priv *hsr = master->hsr; __u8 type = HSR_TLV_LIFE_CHECK; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tag *hsr_stag; struct sk_buff *skb; *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); if (hsr->announce_count < 3 && hsr->prot_version == 0) { type = HSR_TLV_ANNOUNCE; *interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); hsr->announce_count++; } skb = hsr_init_skb(master); if (!skb) { WARN_ONCE(1, "HSR: Could not send supervision frame\n"); return; } hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf)); set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version); /* From HSRv1 on we have separate supervision sequence numbers. */ spin_lock_bh(&hsr->seqnr_lock); if (hsr->prot_version > 0) { hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); hsr->sup_sequence_nr++; } else { hsr_stag->sequence_nr = htons(hsr->sequence_nr); hsr->sequence_nr++; } hsr_stag->tlv.HSR_TLV_type = type; /* TODO: Why 12 in HSRv0? */ hsr_stag->tlv.HSR_TLV_length = hsr->prot_version ? sizeof(struct hsr_sup_payload) : 12; /* Payload: MacAddressA */ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); if (skb_put_padto(skb, ETH_ZLEN)) { spin_unlock_bh(&hsr->seqnr_lock); return; } hsr_forward_skb(skb, master); spin_unlock_bh(&hsr->seqnr_lock); return; } static void send_prp_supervision_frame(struct hsr_port *master, unsigned long *interval) { struct hsr_priv *hsr = master->hsr; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tag *hsr_stag; struct sk_buff *skb; skb = hsr_init_skb(master); if (!skb) { WARN_ONCE(1, "PRP: Could not send supervision frame\n"); return; } *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf)); set_hsr_stag_HSR_ver(hsr_stag, (hsr->prot_version ? 1 : 0)); /* From HSRv1 on we have separate supervision sequence numbers. */ spin_lock_bh(&hsr->seqnr_lock); hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); hsr->sup_sequence_nr++; hsr_stag->tlv.HSR_TLV_type = PRP_TLV_LIFE_CHECK_DD; hsr_stag->tlv.HSR_TLV_length = sizeof(struct hsr_sup_payload); /* Payload: MacAddressA */ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); if (skb_put_padto(skb, ETH_ZLEN)) { spin_unlock_bh(&hsr->seqnr_lock); return; } hsr_forward_skb(skb, master); spin_unlock_bh(&hsr->seqnr_lock); } /* Announce (supervision frame) timer function */ static void hsr_announce(struct timer_list *t) { struct hsr_priv *hsr; struct hsr_port *master; unsigned long interval; hsr = from_timer(hsr, t, announce_timer); rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); hsr->proto_ops->send_sv_frame(master, &interval); if (is_admin_up(master->dev)) mod_timer(&hsr->announce_timer, jiffies + interval); rcu_read_unlock(); } void hsr_del_ports(struct hsr_priv *hsr) { struct hsr_port *port; port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (port) hsr_del_port(port); port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); if (port) hsr_del_port(port); port = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (port) hsr_del_port(port); } static const struct net_device_ops hsr_device_ops = { .ndo_change_mtu = hsr_dev_change_mtu, .ndo_open = hsr_dev_open, .ndo_stop = hsr_dev_close, .ndo_start_xmit = hsr_dev_xmit, .ndo_fix_features = hsr_fix_features, }; static struct device_type hsr_type = { .name = "hsr", }; static struct hsr_proto_ops hsr_ops = { .send_sv_frame = send_hsr_supervision_frame, .create_tagged_frame = hsr_create_tagged_frame, .get_untagged_frame = hsr_get_untagged_frame, .drop_frame = hsr_drop_frame, .fill_frame_info = hsr_fill_frame_info, .invalid_dan_ingress_frame = hsr_invalid_dan_ingress_frame, }; static struct hsr_proto_ops prp_ops = { .send_sv_frame = send_prp_supervision_frame, .create_tagged_frame = prp_create_tagged_frame, .get_untagged_frame = prp_get_untagged_frame, .drop_frame = prp_drop_frame, .fill_frame_info = prp_fill_frame_info, .handle_san_frame = prp_handle_san_frame, .update_san_info = prp_update_san_info, }; void hsr_dev_setup(struct net_device *dev) { eth_hw_addr_random(dev); ether_setup(dev); dev->min_mtu = 0; dev->header_ops = &hsr_header_ops; dev->netdev_ops = &hsr_device_ops; SET_NETDEV_DEVTYPE(dev, &hsr_type); dev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL; dev->needs_free_netdev = true; dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX; dev->features = dev->hw_features; /* Prevent recursive tx locking */ dev->features |= NETIF_F_LLTX; /* VLAN on top of HSR needs testing and probably some work on * hsr_header_create() etc. */ dev->features |= NETIF_F_VLAN_CHALLENGED; /* Not sure about this. Taken from bridge code. netdev_features.h says * it means "Does not change network namespaces". */ dev->features |= NETIF_F_NETNS_LOCAL; } /* Return true if dev is a HSR master; return false otherwise. */ bool is_hsr_master(struct net_device *dev) { return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit); } EXPORT_SYMBOL(is_hsr_master); /* Default multicast address for HSR Supervision frames */ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { 0x01, 0x15, 0x4e, 0x00, 0x01, 0x00 }; int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], unsigned char multicast_spec, u8 protocol_version, struct netlink_ext_ack *extack) { bool unregister = false; struct hsr_priv *hsr; int res; hsr = netdev_priv(hsr_dev); INIT_LIST_HEAD(&hsr->ports); INIT_LIST_HEAD(&hsr->node_db); spin_lock_init(&hsr->list_lock); eth_hw_addr_set(hsr_dev, slave[0]->dev_addr); /* initialize protocol specific functions */ if (protocol_version == PRP_V1) { /* For PRP, lan_id has most significant 3 bits holding * the net_id of PRP_LAN_ID */ hsr->net_id = PRP_LAN_ID << 1; hsr->proto_ops = &prp_ops; } else { hsr->proto_ops = &hsr_ops; } /* Make sure we recognize frames from ourselves in hsr_rcv() */ res = hsr_create_self_node(hsr, hsr_dev->dev_addr, slave[1]->dev_addr); if (res < 0) return res; spin_lock_init(&hsr->seqnr_lock); /* Overflow soon to find bugs easier: */ hsr->sequence_nr = HSR_SEQNR_START; hsr->sup_sequence_nr = HSR_SUP_SEQNR_START; timer_setup(&hsr->announce_timer, hsr_announce, 0); timer_setup(&hsr->prune_timer, hsr_prune_nodes, 0); ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr); hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; hsr->prot_version = protocol_version; /* Make sure the 1st call to netif_carrier_on() gets through */ netif_carrier_off(hsr_dev); res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER, extack); if (res) goto err_add_master; /* HSR forwarding offload supported in lower device? */ if ((slave[0]->features & NETIF_F_HW_HSR_FWD) && (slave[1]->features & NETIF_F_HW_HSR_FWD)) hsr->fwd_offloaded = true; res = register_netdevice(hsr_dev); if (res) goto err_unregister; unregister = true; res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A, extack); if (res) goto err_unregister; res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B, extack); if (res) goto err_unregister; hsr_debugfs_init(hsr, hsr_dev); mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); return 0; err_unregister: hsr_del_ports(hsr); err_add_master: hsr_del_self_node(hsr); if (unregister) unregister_netdevice(hsr_dev); return res; } |
8 8 8 3 3 3 3 12 11 10 9 1 9 6 6 6 6 5 12 2 1 2 1 4 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 | // SPDX-License-Identifier: GPL-2.0-or-later /* L2TPv3 IP encapsulation support * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/ioctls.h> #include <linux/icmp.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/socket.h> #include <linux/l2tp.h> #include <linux/in.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/udp.h> #include <net/inet_common.h> #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> #include "l2tp_core.h" struct l2tp_ip_sock { /* inet_sock has to be the first member of l2tp_ip_sock */ struct inet_sock inet; u32 conn_id; u32 peer_conn_id; }; static DEFINE_RWLOCK(l2tp_ip_lock); static struct hlist_head l2tp_ip_table; static struct hlist_head l2tp_ip_bind_table; static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk) { return (struct l2tp_ip_sock *)sk; } static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr, __be32 raddr, int dif, u32 tunnel_id) { struct sock *sk; sk_for_each_bound(sk, &l2tp_ip_bind_table) { const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); const struct inet_sock *inet = inet_sk(sk); int bound_dev_if; if (!net_eq(sock_net(sk), net)) continue; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && dif && bound_dev_if != dif) continue; if (inet->inet_rcv_saddr && laddr && inet->inet_rcv_saddr != laddr) continue; if (inet->inet_daddr && raddr && inet->inet_daddr != raddr) continue; if (l2tp->conn_id != tunnel_id) continue; goto found; } sk = NULL; found: return sk; } /* When processing receive frames, there are two cases to * consider. Data frames consist of a non-zero session-id and an * optional cookie. Control frames consist of a regular L2TP header * preceded by 32-bits of zeros. * * L2TPv3 Session Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Cookie (optional, maximum 64 bits)... * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * L2TPv3 Control Message Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | (32 bits of zeros) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |T|L|x|x|S|x|x|x|x|x|x|x| Ver | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Control Connection ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ns | Nr | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * All control frames are passed to userspace. */ static int l2tp_ip_recv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sock *sk; u32 session_id; u32 tunnel_id; unsigned char *ptr, *optr; struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; struct iphdr *iph; if (!pskb_may_pull(skb, 4)) goto discard; /* Point to L2TP header */ optr = skb->data; ptr = skb->data; session_id = ntohl(*((__be32 *)ptr)); ptr += 4; /* RFC3931: L2TP/IP packets have the first 4 bytes containing * the session_id. If it is 0, the packet is a L2TP control * frame and the session_id value can be discarded. */ if (session_id == 0) { __skb_pull(skb, 4); goto pass_up; } /* Ok, this is a data packet. Lookup the session. */ session = l2tp_session_get(net, session_id); if (!session) goto discard; tunnel = session->tunnel; if (!tunnel) goto discard_sess; if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) goto discard_sess; l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_dec_refcount(session); return 0; pass_up: /* Get the tunnel_id from the L2TP header */ if (!pskb_may_pull(skb, 12)) goto discard; if ((skb->data[0] & 0xc0) != 0xc0) goto discard; tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = (struct iphdr *)skb_network_header(skb); read_lock_bh(&l2tp_ip_lock); sk = __l2tp_ip_bind_lookup(net, iph->daddr, iph->saddr, inet_iif(skb), tunnel_id); if (!sk) { read_unlock_bh(&l2tp_ip_lock); goto discard; } sock_hold(sk); read_unlock_bh(&l2tp_ip_lock); if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); discard_sess: l2tp_session_dec_refcount(session); goto discard; discard_put: sock_put(sk); discard: kfree_skb(skb); return 0; } static int l2tp_ip_hash(struct sock *sk) { if (sk_unhashed(sk)) { write_lock_bh(&l2tp_ip_lock); sk_add_node(sk, &l2tp_ip_table); write_unlock_bh(&l2tp_ip_lock); } return 0; } static void l2tp_ip_unhash(struct sock *sk) { if (sk_unhashed(sk)) return; write_lock_bh(&l2tp_ip_lock); sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); } static int l2tp_ip_open(struct sock *sk) { /* Prevent autobind. We don't have ports. */ inet_sk(sk)->inet_num = IPPROTO_L2TP; l2tp_ip_hash(sk); return 0; } static void l2tp_ip_close(struct sock *sk, long timeout) { write_lock_bh(&l2tp_ip_lock); hlist_del_init(&sk->sk_bind_node); sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); sk_common_release(sk); } static void l2tp_ip_destroy_sock(struct sock *sk) { struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk); struct sk_buff *skb; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); if (tunnel) l2tp_tunnel_delete(tunnel); } static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *)uaddr; struct net *net = sock_net(sk); int ret; int chk_addr_ret; if (addr_len < sizeof(struct sockaddr_l2tpip)) return -EINVAL; if (addr->l2tp_family != AF_INET) return -EINVAL; lock_sock(sk); ret = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out; if (sk->sk_state != TCP_CLOSE) goto out; chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr); ret = -EADDRNOTAVAIL; if (addr->l2tp_addr.s_addr && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) goto out; if (addr->l2tp_addr.s_addr) { inet->inet_rcv_saddr = addr->l2tp_addr.s_addr; inet->inet_saddr = addr->l2tp_addr.s_addr; } if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ write_lock_bh(&l2tp_ip_lock); if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, 0, sk->sk_bound_dev_if, addr->l2tp_conn_id)) { write_unlock_bh(&l2tp_ip_lock); ret = -EADDRINUSE; goto out; } sk_dst_reset(sk); l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id; sk_add_bind_node(sk, &l2tp_ip_bind_table); sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); ret = 0; sock_reset_flag(sk, SOCK_ZAPPED); out: release_sock(sk); return ret; } static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; int rc; if (addr_len < sizeof(*lsa)) return -EINVAL; if (ipv4_is_multicast(lsa->l2tp_addr.s_addr)) return -EINVAL; lock_sock(sk); /* Must bind first - autobinding does not work */ if (sock_flag(sk, SOCK_ZAPPED)) { rc = -EINVAL; goto out_sk; } rc = __ip4_datagram_connect(sk, uaddr, addr_len); if (rc < 0) goto out_sk; l2tp_ip_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; write_lock_bh(&l2tp_ip_lock); hlist_del_init(&sk->sk_bind_node); sk_add_bind_node(sk, &l2tp_ip_bind_table); write_unlock_bh(&l2tp_ip_lock); out_sk: release_sock(sk); return rc; } static int l2tp_ip_disconnect(struct sock *sk, int flags) { if (sock_flag(sk, SOCK_ZAPPED)) return 0; return __udp_disconnect(sk, flags); } static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); struct l2tp_ip_sock *lsk = l2tp_ip_sk(sk); struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; memset(lsa, 0, sizeof(*lsa)); lsa->l2tp_family = AF_INET; if (peer) { if (!inet->inet_dport) return -ENOTCONN; lsa->l2tp_conn_id = lsk->peer_conn_id; lsa->l2tp_addr.s_addr = inet->inet_daddr; } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; lsa->l2tp_conn_id = lsk->conn_id; lsa->l2tp_addr.s_addr = addr; } return sizeof(*lsa); } static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; /* Charge it to the socket, dropping if the queue is full. */ rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) goto drop; return 0; drop: IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); kfree_skb(skb); return 0; } /* Userspace will call sendmsg() on the tunnel socket to send L2TP * control frames. */ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct sk_buff *skb; int rc; struct inet_sock *inet = inet_sk(sk); struct rtable *rt = NULL; struct flowi4 *fl4; int connected = 0; __be32 daddr; lock_sock(sk); rc = -ENOTCONN; if (sock_flag(sk, SOCK_DEAD)) goto out; /* Get and verify the address. */ if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_l2tpip *, lip, msg->msg_name); rc = -EINVAL; if (msg->msg_namelen < sizeof(*lip)) goto out; if (lip->l2tp_family != AF_INET) { rc = -EAFNOSUPPORT; if (lip->l2tp_family != AF_UNSPEC) goto out; } daddr = lip->l2tp_addr.s_addr; } else { rc = -EDESTADDRREQ; if (sk->sk_state != TCP_ESTABLISHED) goto out; daddr = inet->inet_daddr; connected = 1; } /* Allocate a socket buffer */ rc = -ENOMEM; skb = sock_wmalloc(sk, 2 + NET_SKB_PAD + sizeof(struct iphdr) + 4 + len, 0, GFP_KERNEL); if (!skb) goto error; /* Reserve space for headers, putting IP header on 4-byte boundary. */ skb_reserve(skb, 2 + NET_SKB_PAD); skb_reset_network_header(skb); skb_reserve(skb, sizeof(struct iphdr)); skb_reset_transport_header(skb); /* Insert 0 session_id */ *((__be32 *)skb_put(skb, 4)) = 0; /* Copy user data into skb */ rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc < 0) { kfree_skb(skb); goto error; } fl4 = &inet->cork.fl.u.ip4; if (connected) rt = (struct rtable *)__sk_dst_check(sk, 0); rcu_read_lock(); if (!rt) { const struct ip_options_rcu *inet_opt; inet_opt = rcu_dereference(inet->inet_opt); /* Use correct destination address if we have options. */ if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; if (connected) { sk_setup_caps(sk, &rt->dst); } else { skb_dst_set(skb, &rt->dst); goto xmit; } } /* We don't need to clone dst here, it is guaranteed to not disappear. * __dev_xmit_skb() might force a refcount if needed. */ skb_dst_set_noref(skb, &rt->dst); xmit: /* Queue the packet to IP for output */ rc = ip_queue_xmit(sk, skb, &inet->cork.fl); rcu_read_unlock(); error: if (rc >= 0) rc = len; out: release_sock(sk); return rc; no_route: rcu_read_unlock(); IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); rc = -EHOSTUNREACH; goto out; } static int l2tp_ip_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) goto out; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: return err ? err : copied; } int l2tp_ioctl(struct sock *sk, int cmd, int *karg) { struct sk_buff *skb; switch (cmd) { case SIOCOUTQ: *karg = sk_wmem_alloc_get(sk); break; case SIOCINQ: spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); *karg = skb ? skb->len : 0; spin_unlock_bh(&sk->sk_receive_queue.lock); break; default: return -ENOIOCTLCMD; } return 0; } EXPORT_SYMBOL_GPL(l2tp_ioctl); static struct proto l2tp_ip_prot = { .name = "L2TP/IP", .owner = THIS_MODULE, .init = l2tp_ip_open, .close = l2tp_ip_close, .bind = l2tp_ip_bind, .connect = l2tp_ip_connect, .disconnect = l2tp_ip_disconnect, .ioctl = l2tp_ioctl, .destroy = l2tp_ip_destroy_sock, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .sendmsg = l2tp_ip_sendmsg, .recvmsg = l2tp_ip_recvmsg, .backlog_rcv = l2tp_ip_backlog_recv, .hash = l2tp_ip_hash, .unhash = l2tp_ip_unhash, .obj_size = sizeof(struct l2tp_ip_sock), }; static const struct proto_ops l2tp_ip_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip_getname, .poll = datagram_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, }; static struct inet_protosw l2tp_ip_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_L2TP, .prot = &l2tp_ip_prot, .ops = &l2tp_ip_ops, }; static struct net_protocol l2tp_ip_protocol __read_mostly = { .handler = l2tp_ip_recv, }; static int __init l2tp_ip_init(void) { int err; pr_info("L2TP IP encapsulation support (L2TPv3)\n"); err = proto_register(&l2tp_ip_prot, 1); if (err != 0) goto out; err = inet_add_protocol(&l2tp_ip_protocol, IPPROTO_L2TP); if (err) goto out1; inet_register_protosw(&l2tp_ip_protosw); return 0; out1: proto_unregister(&l2tp_ip_prot); out: return err; } static void __exit l2tp_ip_exit(void) { inet_unregister_protosw(&l2tp_ip_protosw); inet_del_protocol(&l2tp_ip_protocol, IPPROTO_L2TP); proto_unregister(&l2tp_ip_prot); } module_init(l2tp_ip_init); module_exit(l2tp_ip_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("L2TP over IP"); MODULE_VERSION("1.0"); /* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol, * because __stringify doesn't like enums */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 115, 2); MODULE_ALIAS_NET_PF_PROTO(PF_INET, 115); |
3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 | /* * af_llc.c - LLC User Interface SAPs * Description: * Functions in this module are implementation of socket based llc * communications for the Linux operating system. Support of llc class * one and class two is provided via SOCK_DGRAM and SOCK_STREAM * respectively. * * An llc2 connection is (mac + sap), only one llc2 sap connection * is allowed per mac. Though one sap may have multiple mac + sap * connections. * * Copyright (c) 2001 by Jay Schulist <jschlst@samba.org> * 2002-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <net/llc.h> #include <net/llc_sap.h> #include <net/llc_pdu.h> #include <net/llc_conn.h> #include <net/tcp_states.h> /* remember: uninitialized global data is zeroed because its in .bss */ static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; static const struct proto_ops llc_ui_ops; static bool llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); #if 0 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) do {} while (0) #endif /* Maybe we'll add some more in the future. */ #define LLC_CMSG_PKTINFO 1 /** * llc_ui_next_link_no - return the next unused link number for a sap * @sap: Address of sap to get link number from. * * Return the next unused link number for a given sap. */ static inline u16 llc_ui_next_link_no(int sap) { return llc_ui_sap_link_no_max[sap]++; } /** * llc_proto_type - return eth protocol for ARP header type * @arphrd: ARP header type. * * Given an ARP header type return the corresponding ethernet protocol. */ static inline __be16 llc_proto_type(u16 arphrd) { return htons(ETH_P_802_2); } /** * llc_ui_addr_null - determines if a address structure is null * @addr: Address to test if null. */ static inline u8 llc_ui_addr_null(struct sockaddr_llc *addr) { return !memcmp(addr, &llc_ui_addrnull, sizeof(*addr)); } /** * llc_ui_header_len - return length of llc header based on operation * @sk: Socket which contains a valid llc socket type. * @addr: Complete sockaddr_llc structure received from the user. * * Provide the length of the llc header depending on what kind of * operation the user would like to perform and the type of socket. * Returns the correct llc header length. */ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr) { u8 rc = LLC_PDU_LEN_U; if (addr->sllc_test) rc = LLC_PDU_LEN_U; else if (addr->sllc_xid) /* We need to expand header to sizeof(struct llc_xid_info) * since llc_pdu_init_as_xid_cmd() sets 4,5,6 bytes of LLC header * as XID PDU. In llc_ui_sendmsg() we reserved header size and then * filled all other space with user data. If we won't reserve this * bytes, llc_pdu_init_as_xid_cmd() will overwrite user data */ rc = LLC_PDU_LEN_U_XID; else if (sk->sk_type == SOCK_STREAM) rc = LLC_PDU_LEN_I; return rc; } /** * llc_ui_send_data - send data via reliable llc2 connection * @sk: Connection the socket is using. * @skb: Data the user wishes to send. * @noblock: can we block waiting for data? * * Send data via reliable llc2 connection. * Returns 0 upon success, non-zero if action did not succeed. * * This function always consumes a reference to the skb. */ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) { struct llc_sock* llc = llc_sk(sk); if (unlikely(llc_data_accept_state(llc->state) || llc->remote_busy_flag || llc->p_flag)) { long timeout = sock_sndtimeo(sk, noblock); int rc; rc = llc_ui_wait_for_busy_core(sk, timeout); if (rc) { kfree_skb(skb); return rc; } } return llc_build_and_send_pkt(sk, skb); } static void llc_ui_sk_init(struct socket *sock, struct sock *sk) { sock_graft(sk, sock); sk->sk_type = sock->type; sock->ops = &llc_ui_ops; } static struct proto llc_proto = { .name = "LLC", .owner = THIS_MODULE, .obj_size = sizeof(struct llc_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, }; /** * llc_ui_create - alloc and init a new llc_ui socket * @net: network namespace (must be default network) * @sock: Socket to initialize and attach allocated sk to. * @protocol: Unused. * @kern: on behalf of kernel or userspace * * Allocate and initialize a new llc_ui socket, validate the user wants a * socket type we have available. * Returns 0 upon success, negative upon failure. */ static int llc_ui_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; int rc = -ESOCKTNOSUPPORT; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) { rc = -ENOMEM; sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto, kern); if (sk) { rc = 0; llc_ui_sk_init(sock, sk); } } return rc; } /** * llc_ui_release - shutdown socket * @sock: Socket to release. * * Shutdown and deallocate an existing socket. */ static int llc_ui_release(struct socket *sock) { struct sock *sk = sock->sk; struct llc_sock *llc; if (unlikely(sk == NULL)) goto out; sock_hold(sk); lock_sock(sk); llc = llc_sk(sk); dprintk("%s: closing local(%02X) remote(%02X)\n", __func__, llc->laddr.lsap, llc->daddr.lsap); if (!llc_send_disc(sk)) llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); if (!sock_flag(sk, SOCK_ZAPPED)) { struct llc_sap *sap = llc->sap; /* Hold this for release_sock(), so that llc_backlog_rcv() * could still use it. */ llc_sap_hold(sap); llc_sap_remove_socket(llc->sap, sk); release_sock(sk); llc_sap_put(sap); } else { release_sock(sk); } netdev_put(llc->dev, &llc->dev_tracker); sock_put(sk); llc_sk_free(sk); out: return 0; } /** * llc_ui_autoport - provide dynamically allocate SAP number * * Provide the caller with a dynamically allocated SAP number according * to the rules that are set in this function. Returns: 0, upon failure, * SAP number otherwise. */ static int llc_ui_autoport(void) { struct llc_sap *sap; int i, tries = 0; while (tries < LLC_SAP_DYN_TRIES) { for (i = llc_ui_sap_last_autoport; i < LLC_SAP_DYN_STOP; i += 2) { sap = llc_sap_find(i); if (!sap) { llc_ui_sap_last_autoport = i + 2; goto out; } llc_sap_put(sap); } llc_ui_sap_last_autoport = LLC_SAP_DYN_START; tries++; } i = 0; out: return i; } /** * llc_ui_autobind - automatically bind a socket to a sap * @sock: socket to bind * @addr: address to connect to * * Used by llc_ui_connect and llc_ui_sendmsg when the user hasn't * specifically used llc_ui_bind to bind to an specific address/sap * * Returns: 0 upon success, negative otherwise. */ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); struct net_device *dev = NULL; struct llc_sap *sap; int rc = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out; if (!addr->sllc_arphrd) addr->sllc_arphrd = ARPHRD_ETHER; if (addr->sllc_arphrd != ARPHRD_ETHER) goto out; rc = -ENODEV; if (sk->sk_bound_dev_if) { dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); if (dev && addr->sllc_arphrd != dev->type) { dev_put(dev); dev = NULL; } } else dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd); if (!dev) goto out; rc = -EUSERS; llc->laddr.lsap = llc_ui_autoport(); if (!llc->laddr.lsap) goto out; rc = -EBUSY; /* some other network layer is using the sap */ sap = llc_sap_open(llc->laddr.lsap, NULL); if (!sap) goto out; /* Note: We do not expect errors from this point. */ llc->dev = dev; netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL); dev = NULL; memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN); memcpy(&llc->addr, addr, sizeof(llc->addr)); /* assign new connection to its SAP */ llc_sap_add_socket(sap, sk); sock_reset_flag(sk, SOCK_ZAPPED); rc = 0; out: dev_put(dev); return rc; } /** * llc_ui_bind - bind a socket to a specific address. * @sock: Socket to bind an address to. * @uaddr: Address the user wants the socket bound to. * @addrlen: Length of the uaddr structure. * * Bind a socket to a specific address. For llc a user is able to bind to * a specific sap only or mac + sap. * If the user desires to bind to a specific mac + sap, it is possible to * have multiple sap connections via multiple macs. * Bind and autobind for that matter must enforce the correct sap usage * otherwise all hell will break loose. * Returns: 0 upon success, negative otherwise. */ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) { struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); struct net_device *dev = NULL; struct llc_sap *sap; int rc = -EINVAL; lock_sock(sk); if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; if (!addr->sllc_arphrd) addr->sllc_arphrd = ARPHRD_ETHER; if (unlikely(addr->sllc_family != AF_LLC || addr->sllc_arphrd != ARPHRD_ETHER)) goto out; dprintk("%s: binding %02X\n", __func__, addr->sllc_sap); rc = -ENODEV; rcu_read_lock(); if (sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if); if (dev) { if (is_zero_ether_addr(addr->sllc_mac)) memcpy(addr->sllc_mac, dev->dev_addr, IFHWADDRLEN); if (addr->sllc_arphrd != dev->type || !ether_addr_equal(addr->sllc_mac, dev->dev_addr)) { rc = -EINVAL; dev = NULL; } } } else { dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd, addr->sllc_mac); } dev_hold(dev); rcu_read_unlock(); if (!dev) goto out; if (!addr->sllc_sap) { rc = -EUSERS; addr->sllc_sap = llc_ui_autoport(); if (!addr->sllc_sap) goto out; } sap = llc_sap_find(addr->sllc_sap); if (!sap) { sap = llc_sap_open(addr->sllc_sap, NULL); rc = -EBUSY; /* some other network layer is using the sap */ if (!sap) goto out; } else { struct llc_addr laddr, daddr; struct sock *ask; memset(&laddr, 0, sizeof(laddr)); memset(&daddr, 0, sizeof(daddr)); /* * FIXME: check if the address is multicast, * only SOCK_DGRAM can do this. */ memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN); laddr.lsap = addr->sllc_sap; rc = -EADDRINUSE; /* mac + sap clash. */ ask = llc_lookup_established(sap, &daddr, &laddr, &init_net); if (ask) { sock_put(ask); goto out_put; } } /* Note: We do not expect errors from this point. */ llc->dev = dev; netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL); dev = NULL; llc->laddr.lsap = addr->sllc_sap; memcpy(llc->laddr.mac, addr->sllc_mac, IFHWADDRLEN); memcpy(&llc->addr, addr, sizeof(llc->addr)); /* assign new connection to its SAP */ llc_sap_add_socket(sap, sk); sock_reset_flag(sk, SOCK_ZAPPED); rc = 0; out_put: llc_sap_put(sap); out: dev_put(dev); release_sock(sk); return rc; } /** * llc_ui_shutdown - shutdown a connect llc2 socket. * @sock: Socket to shutdown. * @how: What part of the socket to shutdown. * * Shutdown a connected llc2 socket. Currently this function only supports * shutting down both sends and receives (2), we could probably make this * function such that a user can shutdown only half the connection but not * right now. * Returns: 0 upon success, negative otherwise. */ static int llc_ui_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int rc = -ENOTCONN; lock_sock(sk); if (unlikely(sk->sk_state != TCP_ESTABLISHED)) goto out; rc = -EINVAL; if (how != 2) goto out; rc = llc_send_disc(sk); if (!rc) rc = llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); /* Wake up anyone sleeping in poll */ sk->sk_state_change(sk); out: release_sock(sk); return rc; } /** * llc_ui_connect - Connect to a remote llc2 mac + sap. * @sock: Socket which will be connected to the remote destination. * @uaddr: Remote and possibly the local address of the new connection. * @addrlen: Size of uaddr structure. * @flags: Operational flags specified by the user. * * Connect to a remote llc2 mac + sap. The caller must specify the * destination mac and address to connect to. If the user hasn't previously * called bind(2) with a smac the address of the first interface of the * specified arp type will be used. * This function will autobind if user did not previously call bind. * Returns: 0 upon success, negative otherwise. */ static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr, int addrlen, int flags) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; int rc = -EINVAL; lock_sock(sk); if (unlikely(addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; if (unlikely(addr->sllc_family != AF_LLC)) goto out; if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EALREADY; if (unlikely(sock->state == SS_CONNECTING)) goto out; /* bind connection to sap if user hasn't done it. */ if (sock_flag(sk, SOCK_ZAPPED)) { /* bind to sap with null dev, exclusive */ rc = llc_ui_autobind(sock, addr); if (rc) goto out; } llc->daddr.lsap = addr->sllc_sap; memcpy(llc->daddr.mac, addr->sllc_mac, IFHWADDRLEN); sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; llc->link = llc_ui_next_link_no(llc->sap->laddr.lsap); rc = llc_establish_connection(sk, llc->dev->dev_addr, addr->sllc_mac, addr->sllc_sap); if (rc) { dprintk("%s: llc_ui_send_conn failed :-(\n", __func__); sock->state = SS_UNCONNECTED; sk->sk_state = TCP_CLOSE; goto out; } if (sk->sk_state == TCP_SYN_SENT) { const long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if (!timeo || !llc_ui_wait_for_conn(sk, timeo)) goto out; rc = sock_intr_errno(timeo); if (signal_pending(current)) goto out; } if (sk->sk_state == TCP_CLOSE) goto sock_error; sock->state = SS_CONNECTED; rc = 0; out: release_sock(sk); return rc; sock_error: rc = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; goto out; } /** * llc_ui_listen - allow a normal socket to accept incoming connections * @sock: Socket to allow incoming connections on. * @backlog: Number of connections to queue. * * Allow a normal socket to accept incoming connections. * Returns 0 upon success, negative otherwise. */ static int llc_ui_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int rc = -EINVAL; lock_sock(sk); if (unlikely(sock->state != SS_UNCONNECTED)) goto out; rc = -EOPNOTSUPP; if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EAGAIN; if (sock_flag(sk, SOCK_ZAPPED)) goto out; rc = 0; if (!(unsigned int)backlog) /* BSDism */ backlog = 1; sk->sk_max_ack_backlog = backlog; if (sk->sk_state != TCP_LISTEN) { sk->sk_ack_backlog = 0; sk->sk_state = TCP_LISTEN; } sk->sk_socket->flags |= __SO_ACCEPTCON; out: release_sock(sk); return rc; } static int llc_ui_wait_for_disc(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc = 0; add_wait_queue(sk_sleep(sk), &wait); while (1) { if (sk_wait_event(sk, &timeout, READ_ONCE(sk->sk_state) == TCP_CLOSE, &wait)) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; rc = 0; } remove_wait_queue(sk_sleep(sk), &wait); return rc; } static bool llc_ui_wait_for_conn(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(sk_sleep(sk), &wait); while (1) { if (sk_wait_event(sk, &timeout, READ_ONCE(sk->sk_state) != TCP_SYN_SENT, &wait)) break; if (signal_pending(current) || !timeout) break; } remove_wait_queue(sk_sleep(sk), &wait); return timeout; } static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct llc_sock *llc = llc_sk(sk); int rc; add_wait_queue(sk_sleep(sk), &wait); while (1) { rc = 0; if (sk_wait_event(sk, &timeout, (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN) || (!llc_data_accept_state(llc->state) && !llc->remote_busy_flag && !llc->p_flag), &wait)) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; } remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int llc_wait_data(struct sock *sk, long timeo) { int rc; while (1) { /* * POSIX 1003.1g mandates this order. */ rc = sock_error(sk); if (rc) break; rc = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) break; rc = -EAGAIN; if (!timeo) break; rc = sock_intr_errno(timeo); if (signal_pending(current)) break; rc = 0; if (sk_wait_data(sk, &timeo, NULL)) break; } return rc; } static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(skb->sk); if (llc->cmsg_flags & LLC_CMSG_PKTINFO) { struct llc_pktinfo info; memset(&info, 0, sizeof(info)); info.lpi_ifindex = llc_sk(skb->sk)->dev->ifindex; llc_pdu_decode_dsap(skb, &info.lpi_sap); llc_pdu_decode_da(skb, info.lpi_mac); put_cmsg(msg, SOL_LLC, LLC_OPT_PKTINFO, sizeof(info), &info); } } /** * llc_ui_accept - accept a new incoming connection. * @sock: Socket which connections arrive on. * @newsock: Socket to move incoming connection to. * @flags: User specified operational flags. * @kern: If the socket is kernel internal * * Accept a new incoming connection. * Returns 0 upon success, negative otherwise. */ static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk, *newsk; struct llc_sock *llc, *newllc; struct sk_buff *skb; int rc = -EOPNOTSUPP; dprintk("%s: accepting on %02X\n", __func__, llc_sk(sk)->laddr.lsap); lock_sock(sk); if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EINVAL; if (unlikely(sock->state != SS_UNCONNECTED || sk->sk_state != TCP_LISTEN)) goto out; /* wait for a connection to arrive. */ if (skb_queue_empty(&sk->sk_receive_queue)) { rc = llc_wait_data(sk, sk->sk_rcvtimeo); if (rc) goto out; } dprintk("%s: got a new connection on %02X\n", __func__, llc_sk(sk)->laddr.lsap); skb = skb_dequeue(&sk->sk_receive_queue); rc = -EINVAL; if (!skb->sk) goto frees; rc = 0; newsk = skb->sk; /* attach connection to a new socket. */ llc_ui_sk_init(newsock, newsk); sock_reset_flag(newsk, SOCK_ZAPPED); newsk->sk_state = TCP_ESTABLISHED; newsock->state = SS_CONNECTED; llc = llc_sk(sk); newllc = llc_sk(newsk); memcpy(&newllc->addr, &llc->addr, sizeof(newllc->addr)); newllc->link = llc_ui_next_link_no(newllc->laddr.lsap); /* put original socket back into a clean listen state. */ sk->sk_state = TCP_LISTEN; sk_acceptq_removed(sk); dprintk("%s: ok success on %02X, client on %02X\n", __func__, llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap); frees: kfree_skb(skb); out: release_sock(sk); return rc; } /** * llc_ui_recvmsg - copy received data to the socket user. * @sock: Socket to copy data from. * @msg: Various user space related information. * @len: Size of user buffer. * @flags: User specified flags. * * Copy received data to the socket user. * Returns non-negative upon success, negative otherwise. */ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { DECLARE_SOCKADDR(struct sockaddr_llc *, uaddr, msg->msg_name); const int nonblock = flags & MSG_DONTWAIT; struct sk_buff *skb = NULL; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); size_t copied = 0; u32 peek_seq = 0; u32 *seq, skb_len; unsigned long used; int target; /* Read at least this many bytes */ long timeo; lock_sock(sk); copied = -ENOTCONN; if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN)) goto out; timeo = sock_rcvtimeo(sk, nonblock); seq = &llc->copied_seq; if (flags & MSG_PEEK) { peek_seq = llc->copied_seq; seq = &peek_seq; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); copied = 0; do { u32 offset; /* * We need to check signals first, to get correct SIGURG * handling. FIXME: Need to check this doesn't impact 1003.1g * and move it down to the bottom of the loop */ if (signal_pending(current)) { if (copied) break; copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; break; } /* Next get a buffer. */ skb = skb_peek(&sk->sk_receive_queue); if (skb) { offset = *seq; goto found_ok_skb; } /* Well, if we have backlog, try to process it now yet. */ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || (flags & MSG_PEEK)) break; } else { if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) { if (!sock_flag(sk, SOCK_DONE)) { /* * This occurs when user tries to read * from never connected socket. */ copied = -ENOTCONN; break; } break; } if (!timeo) { copied = -EAGAIN; break; } } if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); lock_sock(sk); } else sk_wait_data(sk, &timeo, NULL); if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) { net_dbg_ratelimited("LLC(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); peek_seq = llc->copied_seq; } continue; found_ok_skb: skb_len = skb->len; /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) used = len; if (!(flags & MSG_TRUNC)) { int rc = skb_copy_datagram_msg(skb, offset, msg, used); if (rc) { /* Exception. Bailout! */ if (!copied) copied = -EFAULT; break; } } *seq += used; copied += used; len -= used; /* For non stream protcols we get one packet per recvmsg call */ if (sk->sk_type != SOCK_STREAM) goto copy_uaddr; if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); kfree_skb(skb); *seq = 0; } /* Partial read */ if (used + offset < skb_len) continue; } while (len > 0); out: release_sock(sk); return copied; copy_uaddr: if (uaddr != NULL && skb != NULL) { memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); msg->msg_namelen = sizeof(*uaddr); } if (llc_sk(sk)->cmsg_flags) llc_cmsg_rcv(msg, skb); if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); kfree_skb(skb); *seq = 0; } goto out; } /** * llc_ui_sendmsg - Transmit data provided by the socket user. * @sock: Socket to transmit data from. * @msg: Various user related information. * @len: Length of data to transmit. * * Transmit data provided by the socket user. * Returns non-negative upon success, negative otherwise. */ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); int flags = msg->msg_flags; int noblock = flags & MSG_DONTWAIT; struct sk_buff *skb = NULL; size_t size = 0; int rc = -EINVAL, copied = 0, hdrlen; dprintk("%s: sending from %02X to %02X\n", __func__, llc->laddr.lsap, llc->daddr.lsap); lock_sock(sk); if (addr) { if (msg->msg_namelen < sizeof(*addr)) goto out; } else { if (llc_ui_addr_null(&llc->addr)) goto out; addr = &llc->addr; } /* must bind connection to sap if user hasn't done it. */ if (sock_flag(sk, SOCK_ZAPPED)) { /* bind to sap with null dev, exclusive. */ rc = llc_ui_autobind(sock, addr); if (rc) goto out; } hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr); size = hdrlen + len; if (size > llc->dev->mtu) size = llc->dev->mtu; copied = size - hdrlen; rc = -EINVAL; if (copied < 0) goto out; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (!skb) goto out; skb->dev = llc->dev; skb->protocol = llc_proto_type(addr->sllc_arphrd); skb_reserve(skb, hdrlen); rc = memcpy_from_msg(skb_put(skb, copied), msg, copied); if (rc) goto out; if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) { llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } if (addr->sllc_test) { llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } if (addr->sllc_xid) { llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } rc = -ENOPROTOOPT; if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua)) goto out; rc = llc_ui_send_data(sk, skb, noblock); skb = NULL; out: kfree_skb(skb); if (rc) dprintk("%s: failed sending from %02X to %02X: %d\n", __func__, llc->laddr.lsap, llc->daddr.lsap, rc); release_sock(sk); return rc ? : copied; } /** * llc_ui_getname - return the address info of a socket * @sock: Socket to get address of. * @uaddr: Address structure to return information. * @peer: Does user want local or remote address information. * * Return the address information of a socket. */ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_llc sllc; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); int rc = -EBADF; memset(&sllc, 0, sizeof(sllc)); lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) goto out; if (peer) { rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out; if(llc->dev) sllc.sllc_arphrd = llc->dev->type; sllc.sllc_sap = llc->daddr.lsap; memcpy(&sllc.sllc_mac, &llc->daddr.mac, IFHWADDRLEN); } else { rc = -EINVAL; if (!llc->sap) goto out; sllc.sllc_sap = llc->sap->laddr.lsap; if (llc->dev) { sllc.sllc_arphrd = llc->dev->type; memcpy(&sllc.sllc_mac, llc->dev->dev_addr, IFHWADDRLEN); } } sllc.sllc_family = AF_LLC; memcpy(uaddr, &sllc, sizeof(sllc)); rc = sizeof(sllc); out: release_sock(sk); return rc; } /** * llc_ui_ioctl - io controls for PF_LLC * @sock: Socket to get/set info * @cmd: command * @arg: optional argument for cmd * * get/set info on llc sockets */ static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -ENOIOCTLCMD; } /** * llc_ui_setsockopt - set various connection specific parameters. * @sock: Socket to set options on. * @level: Socket level user is requesting operations on. * @optname: Operation name. * @optval: User provided operation data. * @optlen: Length of optval. * * Set various connection specific parameters. */ static int llc_ui_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); unsigned int opt; int rc = -EINVAL; lock_sock(sk); if (unlikely(level != SOL_LLC || optlen != sizeof(int))) goto out; rc = copy_from_sockptr(&opt, optval, sizeof(opt)); if (rc) goto out; rc = -EINVAL; switch (optname) { case LLC_OPT_RETRY: if (opt > LLC_OPT_MAX_RETRY) goto out; llc->n2 = opt; break; case LLC_OPT_SIZE: if (opt > LLC_OPT_MAX_SIZE) goto out; llc->n1 = opt; break; case LLC_OPT_ACK_TMR_EXP: if (opt > LLC_OPT_MAX_ACK_TMR_EXP) goto out; llc->ack_timer.expire = opt * HZ; break; case LLC_OPT_P_TMR_EXP: if (opt > LLC_OPT_MAX_P_TMR_EXP) goto out; llc->pf_cycle_timer.expire = opt * HZ; break; case LLC_OPT_REJ_TMR_EXP: if (opt > LLC_OPT_MAX_REJ_TMR_EXP) goto out; llc->rej_sent_timer.expire = opt * HZ; break; case LLC_OPT_BUSY_TMR_EXP: if (opt > LLC_OPT_MAX_BUSY_TMR_EXP) goto out; llc->busy_state_timer.expire = opt * HZ; break; case LLC_OPT_TX_WIN: if (opt > LLC_OPT_MAX_WIN) goto out; llc->k = opt; break; case LLC_OPT_RX_WIN: if (opt > LLC_OPT_MAX_WIN) goto out; llc->rw = opt; break; case LLC_OPT_PKTINFO: if (opt) llc->cmsg_flags |= LLC_CMSG_PKTINFO; else llc->cmsg_flags &= ~LLC_CMSG_PKTINFO; break; default: rc = -ENOPROTOOPT; goto out; } rc = 0; out: release_sock(sk); return rc; } /** * llc_ui_getsockopt - get connection specific socket info * @sock: Socket to get information from. * @level: Socket level user is requesting operations on. * @optname: Operation name. * @optval: Variable to return operation data in. * @optlen: Length of optval. * * Get connection specific socket information. */ static int llc_ui_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); int val = 0, len = 0, rc = -EINVAL; lock_sock(sk); if (unlikely(level != SOL_LLC)) goto out; rc = get_user(len, optlen); if (rc) goto out; rc = -EINVAL; if (len != sizeof(int)) goto out; switch (optname) { case LLC_OPT_RETRY: val = llc->n2; break; case LLC_OPT_SIZE: val = llc->n1; break; case LLC_OPT_ACK_TMR_EXP: val = llc->ack_timer.expire / HZ; break; case LLC_OPT_P_TMR_EXP: val = llc->pf_cycle_timer.expire / HZ; break; case LLC_OPT_REJ_TMR_EXP: val = llc->rej_sent_timer.expire / HZ; break; case LLC_OPT_BUSY_TMR_EXP: val = llc->busy_state_timer.expire / HZ; break; case LLC_OPT_TX_WIN: val = llc->k; break; case LLC_OPT_RX_WIN: val = llc->rw; break; case LLC_OPT_PKTINFO: val = (llc->cmsg_flags & LLC_CMSG_PKTINFO) != 0; break; default: rc = -ENOPROTOOPT; goto out; } rc = 0; if (put_user(len, optlen) || copy_to_user(optval, &val, len)) rc = -EFAULT; out: release_sock(sk); return rc; } static const struct net_proto_family llc_ui_family_ops = { .family = PF_LLC, .create = llc_ui_create, .owner = THIS_MODULE, }; static const struct proto_ops llc_ui_ops = { .family = PF_LLC, .owner = THIS_MODULE, .release = llc_ui_release, .bind = llc_ui_bind, .connect = llc_ui_connect, .socketpair = sock_no_socketpair, .accept = llc_ui_accept, .getname = llc_ui_getname, .poll = datagram_poll, .ioctl = llc_ui_ioctl, .listen = llc_ui_listen, .shutdown = llc_ui_shutdown, .setsockopt = llc_ui_setsockopt, .getsockopt = llc_ui_getsockopt, .sendmsg = llc_ui_sendmsg, .recvmsg = llc_ui_recvmsg, .mmap = sock_no_mmap, }; static const char llc_proc_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the proc_fs entries\n"; static const char llc_sysctl_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the sysctl entries\n"; static const char llc_sock_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the network family\n"; static int __init llc2_init(void) { int rc = proto_register(&llc_proto, 0); if (rc != 0) goto out; llc_build_offset_table(); llc_station_init(); llc_ui_sap_last_autoport = LLC_SAP_DYN_START; rc = llc_proc_init(); if (rc != 0) { printk(llc_proc_err_msg); goto out_station; } rc = llc_sysctl_init(); if (rc) { printk(llc_sysctl_err_msg); goto out_proc; } rc = sock_register(&llc_ui_family_ops); if (rc) { printk(llc_sock_err_msg); goto out_sysctl; } llc_add_pack(LLC_DEST_SAP, llc_sap_handler); llc_add_pack(LLC_DEST_CONN, llc_conn_handler); out: return rc; out_sysctl: llc_sysctl_exit(); out_proc: llc_proc_exit(); out_station: llc_station_exit(); proto_unregister(&llc_proto); goto out; } static void __exit llc2_exit(void) { llc_station_exit(); llc_remove_pack(LLC_DEST_SAP); llc_remove_pack(LLC_DEST_CONN); sock_unregister(PF_LLC); llc_proc_exit(); llc_sysctl_exit(); proto_unregister(&llc_proto); } module_init(llc2_init); module_exit(llc2_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003"); MODULE_DESCRIPTION("IEEE 802.2 PF_LLC support"); MODULE_ALIAS_NETPROTO(PF_LLC); |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 | // SPDX-License-Identifier: GPL-2.0-only /* * IEEE 802.1Q Multiple Registration Protocol (MRP) * * Copyright (c) 2012 Massachusetts Institute of Technology * * Adapted from code in net/802/garp.c * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/module.h> #include <net/mrp.h> #include <asm/unaligned.h> static unsigned int mrp_join_time __read_mostly = 200; module_param(mrp_join_time, uint, 0644); MODULE_PARM_DESC(mrp_join_time, "Join time in ms (default 200ms)"); static unsigned int mrp_periodic_time __read_mostly = 1000; module_param(mrp_periodic_time, uint, 0644); MODULE_PARM_DESC(mrp_periodic_time, "Periodic time in ms (default 1s)"); MODULE_LICENSE("GPL"); static const u8 mrp_applicant_state_table[MRP_APPLICANT_MAX + 1][MRP_EVENT_MAX + 1] = { [MRP_APPLICANT_VO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VP, [MRP_EVENT_LV] = MRP_APPLICANT_VO, [MRP_EVENT_TX] = MRP_APPLICANT_VO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AO, [MRP_EVENT_R_IN] = MRP_APPLICANT_VO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VO, [MRP_EVENT_R_MT] = MRP_APPLICANT_VO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VO, }, [MRP_APPLICANT_VP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VP, [MRP_EVENT_LV] = MRP_APPLICANT_VO, [MRP_EVENT_TX] = MRP_APPLICANT_AA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AP, [MRP_EVENT_R_IN] = MRP_APPLICANT_VP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VP, [MRP_EVENT_R_MT] = MRP_APPLICANT_VP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VP, }, [MRP_APPLICANT_VN] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VN, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_AN, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_VN, [MRP_EVENT_R_IN] = MRP_APPLICANT_VN, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VN, [MRP_EVENT_R_MT] = MRP_APPLICANT_VN, [MRP_EVENT_R_LV] = MRP_APPLICANT_VN, [MRP_EVENT_R_LA] = MRP_APPLICANT_VN, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VN, }, [MRP_APPLICANT_AN] = { [MRP_EVENT_NEW] = MRP_APPLICANT_AN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AN, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AN, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AN, [MRP_EVENT_R_IN] = MRP_APPLICANT_AN, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AN, [MRP_EVENT_R_MT] = MRP_APPLICANT_AN, [MRP_EVENT_R_LV] = MRP_APPLICANT_VN, [MRP_EVENT_R_LA] = MRP_APPLICANT_VN, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AN, }, [MRP_APPLICANT_AA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_IN] = MRP_APPLICANT_AA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA, }, [MRP_APPLICANT_QA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA, }, [MRP_APPLICANT_LA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_VO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_LA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_LA, [MRP_EVENT_R_IN] = MRP_APPLICANT_LA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_LA, [MRP_EVENT_R_MT] = MRP_APPLICANT_LA, [MRP_EVENT_R_LV] = MRP_APPLICANT_LA, [MRP_EVENT_R_LA] = MRP_APPLICANT_LA, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_LA, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_LA, }, [MRP_APPLICANT_AO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AP, [MRP_EVENT_LV] = MRP_APPLICANT_AO, [MRP_EVENT_TX] = MRP_APPLICANT_AO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_IN] = MRP_APPLICANT_AO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AO, }, [MRP_APPLICANT_QO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QP, [MRP_EVENT_LV] = MRP_APPLICANT_QO, [MRP_EVENT_TX] = MRP_APPLICANT_QO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_QO, }, [MRP_APPLICANT_AP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AP, [MRP_EVENT_LV] = MRP_APPLICANT_AO, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_IN] = MRP_APPLICANT_AP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP, }, [MRP_APPLICANT_QP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QP, [MRP_EVENT_LV] = MRP_APPLICANT_QO, [MRP_EVENT_TX] = MRP_APPLICANT_QP, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP, }, }; static const u8 mrp_tx_action_table[MRP_APPLICANT_MAX + 1] = { [MRP_APPLICANT_VO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_VP] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_VN] = MRP_TX_ACTION_S_NEW, [MRP_APPLICANT_AN] = MRP_TX_ACTION_S_NEW, [MRP_APPLICANT_AA] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_QA] = MRP_TX_ACTION_S_JOIN_IN_OPTIONAL, [MRP_APPLICANT_LA] = MRP_TX_ACTION_S_LV, [MRP_APPLICANT_AO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_QO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_AP] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_QP] = MRP_TX_ACTION_S_IN_OPTIONAL, }; static void mrp_attrvalue_inc(void *value, u8 len) { u8 *v = (u8 *)value; /* Add 1 to the last byte. If it becomes zero, * go to the previous byte and repeat. */ while (len > 0 && !++v[--len]) ; } static int mrp_attr_cmp(const struct mrp_attr *attr, const void *value, u8 len, u8 type) { if (attr->type != type) return attr->type - type; if (attr->len != len) return attr->len - len; return memcmp(attr->value, value, len); } static struct mrp_attr *mrp_attr_lookup(const struct mrp_applicant *app, const void *value, u8 len, u8 type) { struct rb_node *parent = app->mad.rb_node; struct mrp_attr *attr; int d; while (parent) { attr = rb_entry(parent, struct mrp_attr, node); d = mrp_attr_cmp(attr, value, len, type); if (d > 0) parent = parent->rb_left; else if (d < 0) parent = parent->rb_right; else return attr; } return NULL; } static struct mrp_attr *mrp_attr_create(struct mrp_applicant *app, const void *value, u8 len, u8 type) { struct rb_node *parent = NULL, **p = &app->mad.rb_node; struct mrp_attr *attr; int d; while (*p) { parent = *p; attr = rb_entry(parent, struct mrp_attr, node); d = mrp_attr_cmp(attr, value, len, type); if (d > 0) p = &parent->rb_left; else if (d < 0) p = &parent->rb_right; else { /* The attribute already exists; re-use it. */ return attr; } } attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC); if (!attr) return attr; attr->state = MRP_APPLICANT_VO; attr->type = type; attr->len = len; memcpy(attr->value, value, len); rb_link_node(&attr->node, parent, p); rb_insert_color(&attr->node, &app->mad); return attr; } static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr) { rb_erase(&attr->node, &app->mad); kfree(attr); } static void mrp_attr_destroy_all(struct mrp_applicant *app) { struct rb_node *node, *next; struct mrp_attr *attr; for (node = rb_first(&app->mad); next = node ? rb_next(node) : NULL, node != NULL; node = next) { attr = rb_entry(node, struct mrp_attr, node); mrp_attr_destroy(app, attr); } } static int mrp_pdu_init(struct mrp_applicant *app) { struct sk_buff *skb; struct mrp_pdu_hdr *ph; skb = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev), GFP_ATOMIC); if (!skb) return -ENOMEM; skb->dev = app->dev; skb->protocol = app->app->pkttype.type; skb_reserve(skb, LL_RESERVED_SPACE(app->dev)); skb_reset_network_header(skb); skb_reset_transport_header(skb); ph = __skb_put(skb, sizeof(*ph)); ph->version = app->app->version; app->pdu = skb; return 0; } static int mrp_pdu_append_end_mark(struct mrp_applicant *app) { __be16 *endmark; if (skb_tailroom(app->pdu) < sizeof(*endmark)) return -1; endmark = __skb_put(app->pdu, sizeof(*endmark)); put_unaligned(MRP_END_MARK, endmark); return 0; } static void mrp_pdu_queue(struct mrp_applicant *app) { if (!app->pdu) return; if (mrp_cb(app->pdu)->mh) mrp_pdu_append_end_mark(app); mrp_pdu_append_end_mark(app); dev_hard_header(app->pdu, app->dev, ntohs(app->app->pkttype.type), app->app->group_address, app->dev->dev_addr, app->pdu->len); skb_queue_tail(&app->queue, app->pdu); app->pdu = NULL; } static void mrp_queue_xmit(struct mrp_applicant *app) { struct sk_buff *skb; while ((skb = skb_dequeue(&app->queue))) dev_queue_xmit(skb); } static int mrp_pdu_append_msg_hdr(struct mrp_applicant *app, u8 attrtype, u8 attrlen) { struct mrp_msg_hdr *mh; if (mrp_cb(app->pdu)->mh) { if (mrp_pdu_append_end_mark(app) < 0) return -1; mrp_cb(app->pdu)->mh = NULL; mrp_cb(app->pdu)->vah = NULL; } if (skb_tailroom(app->pdu) < sizeof(*mh)) return -1; mh = __skb_put(app->pdu, sizeof(*mh)); mh->attrtype = attrtype; mh->attrlen = attrlen; mrp_cb(app->pdu)->mh = mh; return 0; } static int mrp_pdu_append_vecattr_hdr(struct mrp_applicant *app, const void *firstattrvalue, u8 attrlen) { struct mrp_vecattr_hdr *vah; if (skb_tailroom(app->pdu) < sizeof(*vah) + attrlen) return -1; vah = __skb_put(app->pdu, sizeof(*vah) + attrlen); put_unaligned(0, &vah->lenflags); memcpy(vah->firstattrvalue, firstattrvalue, attrlen); mrp_cb(app->pdu)->vah = vah; memcpy(mrp_cb(app->pdu)->attrvalue, firstattrvalue, attrlen); return 0; } static int mrp_pdu_append_vecattr_event(struct mrp_applicant *app, const struct mrp_attr *attr, enum mrp_vecattr_event vaevent) { u16 len, pos; u8 *vaevents; int err; again: if (!app->pdu) { err = mrp_pdu_init(app); if (err < 0) return err; } /* If there is no Message header in the PDU, or the Message header is * for a different attribute type, add an EndMark (if necessary) and a * new Message header to the PDU. */ if (!mrp_cb(app->pdu)->mh || mrp_cb(app->pdu)->mh->attrtype != attr->type || mrp_cb(app->pdu)->mh->attrlen != attr->len) { if (mrp_pdu_append_msg_hdr(app, attr->type, attr->len) < 0) goto queue; } /* If there is no VectorAttribute header for this Message in the PDU, * or this attribute's value does not sequentially follow the previous * attribute's value, add a new VectorAttribute header to the PDU. */ if (!mrp_cb(app->pdu)->vah || memcmp(mrp_cb(app->pdu)->attrvalue, attr->value, attr->len)) { if (mrp_pdu_append_vecattr_hdr(app, attr->value, attr->len) < 0) goto queue; } len = be16_to_cpu(get_unaligned(&mrp_cb(app->pdu)->vah->lenflags)); pos = len % 3; /* Events are packed into Vectors in the PDU, three to a byte. Add a * byte to the end of the Vector if necessary. */ if (!pos) { if (skb_tailroom(app->pdu) < sizeof(u8)) goto queue; vaevents = __skb_put(app->pdu, sizeof(u8)); } else { vaevents = (u8 *)(skb_tail_pointer(app->pdu) - sizeof(u8)); } switch (pos) { case 0: *vaevents = vaevent * (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); break; case 1: *vaevents += vaevent * __MRP_VECATTR_EVENT_MAX; break; case 2: *vaevents += vaevent; break; default: WARN_ON(1); } /* Increment the length of the VectorAttribute in the PDU, as well as * the value of the next attribute that would continue its Vector. */ put_unaligned(cpu_to_be16(++len), &mrp_cb(app->pdu)->vah->lenflags); mrp_attrvalue_inc(mrp_cb(app->pdu)->attrvalue, attr->len); return 0; queue: mrp_pdu_queue(app); goto again; } static void mrp_attr_event(struct mrp_applicant *app, struct mrp_attr *attr, enum mrp_event event) { enum mrp_applicant_state state; state = mrp_applicant_state_table[attr->state][event]; if (state == MRP_APPLICANT_INVALID) { WARN_ON(1); return; } if (event == MRP_EVENT_TX) { /* When appending the attribute fails, don't update its state * in order to retry at the next TX event. */ switch (mrp_tx_action_table[attr->state]) { case MRP_TX_ACTION_NONE: case MRP_TX_ACTION_S_JOIN_IN_OPTIONAL: case MRP_TX_ACTION_S_IN_OPTIONAL: break; case MRP_TX_ACTION_S_NEW: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_NEW) < 0) return; break; case MRP_TX_ACTION_S_JOIN_IN: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_JOIN_IN) < 0) return; break; case MRP_TX_ACTION_S_LV: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_LV) < 0) return; /* As a pure applicant, sending a leave message * implies that the attribute was unregistered and * can be destroyed. */ mrp_attr_destroy(app, attr); return; default: WARN_ON(1); } } attr->state = state; } int mrp_request_join(const struct net_device *dev, const struct mrp_application *appl, const void *value, u8 len, u8 type) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); struct mrp_attr *attr; if (sizeof(struct mrp_skb_cb) + len > sizeof_field(struct sk_buff, cb)) return -ENOMEM; spin_lock_bh(&app->lock); attr = mrp_attr_create(app, value, len, type); if (!attr) { spin_unlock_bh(&app->lock); return -ENOMEM; } mrp_attr_event(app, attr, MRP_EVENT_JOIN); spin_unlock_bh(&app->lock); return 0; } EXPORT_SYMBOL_GPL(mrp_request_join); void mrp_request_leave(const struct net_device *dev, const struct mrp_application *appl, const void *value, u8 len, u8 type) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); struct mrp_attr *attr; if (sizeof(struct mrp_skb_cb) + len > sizeof_field(struct sk_buff, cb)) return; spin_lock_bh(&app->lock); attr = mrp_attr_lookup(app, value, len, type); if (!attr) { spin_unlock_bh(&app->lock); return; } mrp_attr_event(app, attr, MRP_EVENT_LV); spin_unlock_bh(&app->lock); } EXPORT_SYMBOL_GPL(mrp_request_leave); static void mrp_mad_event(struct mrp_applicant *app, enum mrp_event event) { struct rb_node *node, *next; struct mrp_attr *attr; for (node = rb_first(&app->mad); next = node ? rb_next(node) : NULL, node != NULL; node = next) { attr = rb_entry(node, struct mrp_attr, node); mrp_attr_event(app, attr, event); } } static void mrp_join_timer_arm(struct mrp_applicant *app) { unsigned long delay; delay = get_random_u32_below(msecs_to_jiffies(mrp_join_time)); mod_timer(&app->join_timer, jiffies + delay); } static void mrp_join_timer(struct timer_list *t) { struct mrp_applicant *app = from_timer(app, t, join_timer); spin_lock(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); mrp_pdu_queue(app); spin_unlock(&app->lock); mrp_queue_xmit(app); spin_lock(&app->lock); if (likely(app->active)) mrp_join_timer_arm(app); spin_unlock(&app->lock); } static void mrp_periodic_timer_arm(struct mrp_applicant *app) { mod_timer(&app->periodic_timer, jiffies + msecs_to_jiffies(mrp_periodic_time)); } static void mrp_periodic_timer(struct timer_list *t) { struct mrp_applicant *app = from_timer(app, t, periodic_timer); spin_lock(&app->lock); if (likely(app->active)) { mrp_mad_event(app, MRP_EVENT_PERIODIC); mrp_pdu_queue(app); mrp_periodic_timer_arm(app); } spin_unlock(&app->lock); } static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset) { __be16 endmark; if (skb_copy_bits(skb, *offset, &endmark, sizeof(endmark)) < 0) return -1; if (endmark == MRP_END_MARK) { *offset += sizeof(endmark); return -1; } return 0; } static void mrp_pdu_parse_vecattr_event(struct mrp_applicant *app, struct sk_buff *skb, enum mrp_vecattr_event vaevent) { struct mrp_attr *attr; enum mrp_event event; attr = mrp_attr_lookup(app, mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen, mrp_cb(skb)->mh->attrtype); if (attr == NULL) return; switch (vaevent) { case MRP_VECATTR_EVENT_NEW: event = MRP_EVENT_R_NEW; break; case MRP_VECATTR_EVENT_JOIN_IN: event = MRP_EVENT_R_JOIN_IN; break; case MRP_VECATTR_EVENT_IN: event = MRP_EVENT_R_IN; break; case MRP_VECATTR_EVENT_JOIN_MT: event = MRP_EVENT_R_JOIN_MT; break; case MRP_VECATTR_EVENT_MT: event = MRP_EVENT_R_MT; break; case MRP_VECATTR_EVENT_LV: event = MRP_EVENT_R_LV; break; default: return; } mrp_attr_event(app, attr, event); } static int mrp_pdu_parse_vecattr(struct mrp_applicant *app, struct sk_buff *skb, int *offset) { struct mrp_vecattr_hdr _vah; u16 valen; u8 vaevents, vaevent; mrp_cb(skb)->vah = skb_header_pointer(skb, *offset, sizeof(_vah), &_vah); if (!mrp_cb(skb)->vah) return -1; *offset += sizeof(_vah); if (get_unaligned(&mrp_cb(skb)->vah->lenflags) & MRP_VECATTR_HDR_FLAG_LA) mrp_mad_event(app, MRP_EVENT_R_LA); valen = be16_to_cpu(get_unaligned(&mrp_cb(skb)->vah->lenflags) & MRP_VECATTR_HDR_LEN_MASK); /* The VectorAttribute structure in a PDU carries event information * about one or more attributes having consecutive values. Only the * value for the first attribute is contained in the structure. So * we make a copy of that value, and then increment it each time we * advance to the next event in its Vector. */ if (sizeof(struct mrp_skb_cb) + mrp_cb(skb)->mh->attrlen > sizeof_field(struct sk_buff, cb)) return -1; if (skb_copy_bits(skb, *offset, mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen) < 0) return -1; *offset += mrp_cb(skb)->mh->attrlen; /* In a VectorAttribute, the Vector contains events which are packed * three to a byte. We process one byte of the Vector at a time. */ while (valen > 0) { if (skb_copy_bits(skb, *offset, &vaevents, sizeof(vaevents)) < 0) return -1; *offset += sizeof(vaevents); /* Extract and process the first event. */ vaevent = vaevents / (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); if (vaevent >= __MRP_VECATTR_EVENT_MAX) { /* The byte is malformed; stop processing. */ return -1; } mrp_pdu_parse_vecattr_event(app, skb, vaevent); /* If present, extract and process the second event. */ if (!--valen) break; mrp_attrvalue_inc(mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen); vaevents %= (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); vaevent = vaevents / __MRP_VECATTR_EVENT_MAX; mrp_pdu_parse_vecattr_event(app, skb, vaevent); /* If present, extract and process the third event. */ if (!--valen) break; mrp_attrvalue_inc(mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen); vaevents %= __MRP_VECATTR_EVENT_MAX; vaevent = vaevents; mrp_pdu_parse_vecattr_event(app, skb, vaevent); } return 0; } static int mrp_pdu_parse_msg(struct mrp_applicant *app, struct sk_buff *skb, int *offset) { struct mrp_msg_hdr _mh; mrp_cb(skb)->mh = skb_header_pointer(skb, *offset, sizeof(_mh), &_mh); if (!mrp_cb(skb)->mh) return -1; *offset += sizeof(_mh); if (mrp_cb(skb)->mh->attrtype == 0 || mrp_cb(skb)->mh->attrtype > app->app->maxattr || mrp_cb(skb)->mh->attrlen == 0) return -1; while (skb->len > *offset) { if (mrp_pdu_parse_end_mark(skb, offset) < 0) break; if (mrp_pdu_parse_vecattr(app, skb, offset) < 0) return -1; } return 0; } static int mrp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct mrp_application *appl = container_of(pt, struct mrp_application, pkttype); struct mrp_port *port; struct mrp_applicant *app; struct mrp_pdu_hdr _ph; const struct mrp_pdu_hdr *ph; int offset = skb_network_offset(skb); /* If the interface is in promiscuous mode, drop the packet if * it was unicast to another host. */ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) goto out; skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto out; port = rcu_dereference(dev->mrp_port); if (unlikely(!port)) goto out; app = rcu_dereference(port->applicants[appl->type]); if (unlikely(!app)) goto out; ph = skb_header_pointer(skb, offset, sizeof(_ph), &_ph); if (!ph) goto out; offset += sizeof(_ph); if (ph->version != app->app->version) goto out; spin_lock(&app->lock); while (skb->len > offset) { if (mrp_pdu_parse_end_mark(skb, &offset) < 0) break; if (mrp_pdu_parse_msg(app, skb, &offset) < 0) break; } spin_unlock(&app->lock); out: kfree_skb(skb); return 0; } static int mrp_init_port(struct net_device *dev) { struct mrp_port *port; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; rcu_assign_pointer(dev->mrp_port, port); return 0; } static void mrp_release_port(struct net_device *dev) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); unsigned int i; for (i = 0; i <= MRP_APPLICATION_MAX; i++) { if (rtnl_dereference(port->applicants[i])) return; } RCU_INIT_POINTER(dev->mrp_port, NULL); kfree_rcu(port, rcu); } int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl) { struct mrp_applicant *app; int err; ASSERT_RTNL(); if (!rtnl_dereference(dev->mrp_port)) { err = mrp_init_port(dev); if (err < 0) goto err1; } err = -ENOMEM; app = kzalloc(sizeof(*app), GFP_KERNEL); if (!app) goto err2; err = dev_mc_add(dev, appl->group_address); if (err < 0) goto err3; app->dev = dev; app->app = appl; app->mad = RB_ROOT; app->active = true; spin_lock_init(&app->lock); skb_queue_head_init(&app->queue); rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app); timer_setup(&app->join_timer, mrp_join_timer, 0); mrp_join_timer_arm(app); timer_setup(&app->periodic_timer, mrp_periodic_timer, 0); mrp_periodic_timer_arm(app); return 0; err3: kfree(app); err2: mrp_release_port(dev); err1: return err; } EXPORT_SYMBOL_GPL(mrp_init_applicant); void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); ASSERT_RTNL(); RCU_INIT_POINTER(port->applicants[appl->type], NULL); spin_lock_bh(&app->lock); app->active = false; spin_unlock_bh(&app->lock); /* Delete timer and generate a final TX event to flush out * all pending messages before the applicant is gone. */ timer_shutdown_sync(&app->join_timer); timer_shutdown_sync(&app->periodic_timer); spin_lock_bh(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); mrp_attr_destroy_all(app); mrp_pdu_queue(app); spin_unlock_bh(&app->lock); mrp_queue_xmit(app); dev_mc_del(dev, appl->group_address); kfree_rcu(app, rcu); mrp_release_port(dev); } EXPORT_SYMBOL_GPL(mrp_uninit_applicant); int mrp_register_application(struct mrp_application *appl) { appl->pkttype.func = mrp_rcv; dev_add_pack(&appl->pkttype); return 0; } EXPORT_SYMBOL_GPL(mrp_register_application); void mrp_unregister_application(struct mrp_application *appl) { dev_remove_pack(&appl->pkttype); } EXPORT_SYMBOL_GPL(mrp_unregister_application); |
8497 8499 8496 2 8494 8486 26 8497 8500 8498 2 8495 8486 26 8479 6454 6454 6454 6454 6323 3 14 14 12 8 14 14 12 11 11 11 12 2 14 14 14 8475 8474 8477 4230 2726 6465 14 14 6454 2741 8477 8474 8474 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 | // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/realpath.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/magic.h> #include <linux/proc_fs.h> /** * tomoyo_encode2 - Encode binary string to ascii string. * * @str: String in binary format. * @str_len: Size of @str in byte. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode2(const char *str, int str_len) { int i; int len = 0; const char *p = str; char *cp; char *cp0; if (!p) return NULL; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') len += 2; else if (c > ' ' && c < 127) len++; else len += 4; } len++; /* Reserve space for appending "/". */ cp = kzalloc(len + 10, GFP_NOFS); if (!cp) return NULL; cp0 = cp; p = str; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') { *cp++ = '\\'; *cp++ = '\\'; } else if (c > ' ' && c < 127) { *cp++ = c; } else { *cp++ = '\\'; *cp++ = (c >> 6) + '0'; *cp++ = ((c >> 3) & 7) + '0'; *cp++ = (c & 7) + '0'; } } return cp0; } /** * tomoyo_encode - Encode binary string to ascii string. * * @str: String in binary format. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode(const char *str) { return str ? tomoyo_encode2(str, strlen(str)) : NULL; } /** * tomoyo_get_absolute_path - Get the path of a dentry but ignores chroot'ed root. * * @path: Pointer to "struct path". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_absolute_path(const struct path *path, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { /* go to whatever namespace root we are under */ pos = d_absolute_path(path, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(path->dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_dentry_path - Get the path of a dentry. * * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_dentry_path(struct dentry *dentry, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { pos = dentry_path_raw(dentry, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_local_path - Get the path of a dentry. * * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. */ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, const int buflen) { struct super_block *sb = dentry->d_sb; char *pos = tomoyo_get_dentry_path(dentry, buffer, buflen); if (IS_ERR(pos)) return pos; /* Convert from $PID to self if $PID is current thread. */ if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') { char *ep; const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10); struct pid_namespace *proc_pidns = proc_pid_ns(sb); if (*ep == '/' && pid && pid == task_tgid_nr_ns(current, proc_pidns)) { pos = ep - 5; if (pos < buffer) goto out; memmove(pos, "/self", 5); } goto prepend_filesystem_name; } /* Use filesystem name for unnamed devices. */ if (!MAJOR(sb->s_dev)) goto prepend_filesystem_name; { struct inode *inode = d_backing_inode(sb->s_root); /* * Use filesystem name if filesystem does not support rename() * operation. */ if (!inode->i_op->rename) goto prepend_filesystem_name; } /* Prepend device name. */ { char name[64]; int name_len; const dev_t dev = sb->s_dev; name[sizeof(name) - 1] = '\0'; snprintf(name, sizeof(name) - 1, "dev(%u,%u):", MAJOR(dev), MINOR(dev)); name_len = strlen(name); pos -= name_len; if (pos < buffer) goto out; memmove(pos, name, name_len); return pos; } /* Prepend filesystem name. */ prepend_filesystem_name: { const char *name = sb->s_type->name; const int name_len = strlen(name); pos -= name_len + 1; if (pos < buffer) goto out; memmove(pos, name, name_len); pos[name_len] = ':'; } return pos; out: return ERR_PTR(-ENOMEM); } /** * tomoyo_realpath_from_path - Returns realpath(3) of the given pathname but ignores chroot'ed root. * * @path: Pointer to "struct path". * * Returns the realpath of the given @path on success, NULL otherwise. * * If dentry is a directory, trailing '/' is appended. * Characters out of 0x20 < c < 0x7F range are converted to * \ooo style octal string. * Character \ is converted to \\ string. * * These functions use kzalloc(), so the caller must call kfree() * if these functions didn't return NULL. */ char *tomoyo_realpath_from_path(const struct path *path) { char *buf = NULL; char *name = NULL; unsigned int buf_len = PAGE_SIZE / 2; struct dentry *dentry = path->dentry; struct super_block *sb = dentry->d_sb; while (1) { char *pos; struct inode *inode; buf_len <<= 1; kfree(buf); buf = kmalloc(buf_len, GFP_NOFS); if (!buf) break; /* To make sure that pos is '\0' terminated. */ buf[buf_len - 1] = '\0'; /* For "pipe:[\$]" and "socket:[\$]". */ if (dentry->d_op && dentry->d_op->d_dname) { pos = dentry->d_op->d_dname(dentry, buf, buf_len - 1); goto encode; } inode = d_backing_inode(sb->s_root); /* * Get local name for filesystems without rename() operation */ if ((!inode->i_op->rename && !(sb->s_type->fs_flags & FS_REQUIRES_DEV))) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); /* Get absolute name for the rest. */ else { pos = tomoyo_get_absolute_path(path, buf, buf_len - 1); /* * Fall back to local name if absolute name is not * available. */ if (pos == ERR_PTR(-EINVAL)) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); } encode: if (IS_ERR(pos)) continue; name = tomoyo_encode(pos); break; } kfree(buf); if (!name) tomoyo_warn_oom(__func__); return name; } /** * tomoyo_realpath_nofollow - Get realpath of a pathname. * * @pathname: The pathname to solve. * * Returns the realpath of @pathname on success, NULL otherwise. */ char *tomoyo_realpath_nofollow(const char *pathname) { struct path path; if (pathname && kern_path(pathname, 0, &path) == 0) { char *buf = tomoyo_realpath_from_path(&path); path_put(&path); return buf; } return NULL; } |
13082 1269 2186 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor label definitions * * Copyright 2017 Canonical Ltd. */ #ifndef __AA_LABEL_H #define __AA_LABEL_H #include <linux/atomic.h> #include <linux/audit.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> #include "apparmor.h" #include "lib.h" struct aa_ns; #define LOCAL_VEC_ENTRIES 8 #define DEFINE_VEC(T, V) \ struct aa_ ## T *(_ ## V ## _localtmp)[LOCAL_VEC_ENTRIES]; \ struct aa_ ## T **(V) #define vec_setup(T, V, N, GFP) \ ({ \ if ((N) <= LOCAL_VEC_ENTRIES) { \ typeof(N) i; \ (V) = (_ ## V ## _localtmp); \ for (i = 0; i < (N); i++) \ (V)[i] = NULL; \ } else \ (V) = kzalloc(sizeof(struct aa_ ## T *) * (N), (GFP)); \ (V) ? 0 : -ENOMEM; \ }) #define vec_cleanup(T, V, N) \ do { \ int i; \ for (i = 0; i < (N); i++) { \ if (!IS_ERR_OR_NULL((V)[i])) \ aa_put_ ## T((V)[i]); \ } \ if ((V) != _ ## V ## _localtmp) \ kfree(V); \ } while (0) #define vec_last(VEC, SIZE) ((VEC)[(SIZE) - 1]) #define vec_ns(VEC, SIZE) (vec_last((VEC), (SIZE))->ns) #define vec_labelset(VEC, SIZE) (&vec_ns((VEC), (SIZE))->labels) #define cleanup_domain_vec(V, L) cleanup_label_vec((V), (L)->size) struct aa_profile; #define VEC_FLAG_TERMINATE 1 int aa_vec_unique(struct aa_profile **vec, int n, int flags); struct aa_label *aa_vec_find_or_create_label(struct aa_profile **vec, int len, gfp_t gfp); #define aa_sort_and_merge_vec(N, V) \ aa_sort_and_merge_profiles((N), (struct aa_profile **)(V)) /* struct aa_labelset - set of labels for a namespace * * Labels are reference counted; aa_labelset does not contribute to label * reference counts. Once a label's last refcount is put it is removed from * the set. */ struct aa_labelset { rwlock_t lock; struct rb_root root; }; #define __labelset_for_each(LS, N) \ for ((N) = rb_first(&(LS)->root); (N); (N) = rb_next(N)) enum label_flags { FLAG_HAT = 1, /* profile is a hat */ FLAG_UNCONFINED = 2, /* label unconfined only if all */ FLAG_NULL = 4, /* profile is null learning profile */ FLAG_IX_ON_NAME_ERROR = 8, /* fallback to ix on name lookup fail */ FLAG_IMMUTIBLE = 0x10, /* don't allow changes/replacement */ FLAG_USER_DEFINED = 0x20, /* user based profile - lower privs */ FLAG_NO_LIST_REF = 0x40, /* list doesn't keep profile ref */ FLAG_NS_COUNT = 0x80, /* carries NS ref count */ FLAG_IN_TREE = 0x100, /* label is in tree */ FLAG_PROFILE = 0x200, /* label is a profile */ FLAG_EXPLICIT = 0x400, /* explicit static label */ FLAG_STALE = 0x800, /* replaced/removed */ FLAG_RENAMED = 0x1000, /* label has renaming in it */ FLAG_REVOKED = 0x2000, /* label has revocation in it */ FLAG_DEBUG1 = 0x4000, FLAG_DEBUG2 = 0x8000, /* These flags must correspond with PATH_flags */ /* TODO: add new path flags */ }; struct aa_label; struct aa_proxy { struct kref count; struct aa_label __rcu *label; }; struct label_it { int i, j; }; /* struct aa_label - lazy labeling struct * @count: ref count of active users * @node: rbtree position * @rcu: rcu callback struct * @proxy: is set to the label that replaced this label * @hname: text representation of the label (MAYBE_NULL) * @flags: stale and other flags - values may change under label set lock * @secid: secid that references this label * @size: number of entries in @ent[] * @ent: set of profiles for label, actual size determined by @size */ struct aa_label { struct kref count; struct rb_node node; struct rcu_head rcu; struct aa_proxy *proxy; __counted char *hname; long flags; u32 secid; int size; struct aa_profile *vec[]; }; #define last_error(E, FN) \ do { \ int __subE = (FN); \ if (__subE) \ (E) = __subE; \ } while (0) #define label_isprofile(X) ((X)->flags & FLAG_PROFILE) #define label_unconfined(X) ((X)->flags & FLAG_UNCONFINED) #define unconfined(X) label_unconfined(X) #define label_is_stale(X) ((X)->flags & FLAG_STALE) #define __label_make_stale(X) ((X)->flags |= FLAG_STALE) #define labels_ns(X) (vec_ns(&((X)->vec[0]), (X)->size)) #define labels_set(X) (&labels_ns(X)->labels) #define labels_view(X) labels_ns(X) #define labels_profile(X) ((X)->vec[(X)->size - 1]) int aa_label_next_confined(struct aa_label *l, int i); /* for each profile in a label */ #define label_for_each(I, L, P) \ for ((I).i = 0; ((P) = (L)->vec[(I).i]); ++((I).i)) /* assumes break/goto ended label_for_each */ #define label_for_each_cont(I, L, P) \ for (++((I).i); ((P) = (L)->vec[(I).i]); ++((I).i)) #define next_comb(I, L1, L2) \ do { \ (I).j++; \ if ((I).j >= (L2)->size) { \ (I).i++; \ (I).j = 0; \ } \ } while (0) /* for each combination of P1 in L1, and P2 in L2 */ #define label_for_each_comb(I, L1, L2, P1, P2) \ for ((I).i = (I).j = 0; \ ((P1) = (L1)->vec[(I).i]) && ((P2) = (L2)->vec[(I).j]); \ (I) = next_comb(I, L1, L2)) #define fn_for_each_comb(L1, L2, P1, P2, FN) \ ({ \ struct label_it i; \ int __E = 0; \ label_for_each_comb(i, (L1), (L2), (P1), (P2)) { \ last_error(__E, (FN)); \ } \ __E; \ }) /* for each profile that is enforcing confinement in a label */ #define label_for_each_confined(I, L, P) \ for ((I).i = aa_label_next_confined((L), 0); \ ((P) = (L)->vec[(I).i]); \ (I).i = aa_label_next_confined((L), (I).i + 1)) #define label_for_each_in_merge(I, A, B, P) \ for ((I).i = (I).j = 0; \ ((P) = aa_label_next_in_merge(&(I), (A), (B))); \ ) #define label_for_each_not_in_set(I, SET, SUB, P) \ for ((I).i = (I).j = 0; \ ((P) = __aa_label_next_not_in_set(&(I), (SET), (SUB))); \ ) #define next_in_ns(i, NS, L) \ ({ \ typeof(i) ___i = (i); \ while ((L)->vec[___i] && (L)->vec[___i]->ns != (NS)) \ (___i)++; \ (___i); \ }) #define label_for_each_in_ns(I, NS, L, P) \ for ((I).i = next_in_ns(0, (NS), (L)); \ ((P) = (L)->vec[(I).i]); \ (I).i = next_in_ns((I).i + 1, (NS), (L))) #define fn_for_each_in_ns(L, P, FN) \ ({ \ struct label_it __i; \ struct aa_ns *__ns = labels_ns(L); \ int __E = 0; \ label_for_each_in_ns(__i, __ns, (L), (P)) { \ last_error(__E, (FN)); \ } \ __E; \ }) #define fn_for_each_XXX(L, P, FN, ...) \ ({ \ struct label_it i; \ int __E = 0; \ label_for_each ## __VA_ARGS__(i, (L), (P)) { \ last_error(__E, (FN)); \ } \ __E; \ }) #define fn_for_each(L, P, FN) fn_for_each_XXX(L, P, FN) #define fn_for_each_confined(L, P, FN) fn_for_each_XXX(L, P, FN, _confined) #define fn_for_each2_XXX(L1, L2, P, FN, ...) \ ({ \ struct label_it i; \ int __E = 0; \ label_for_each ## __VA_ARGS__(i, (L1), (L2), (P)) { \ last_error(__E, (FN)); \ } \ __E; \ }) #define fn_for_each_in_merge(L1, L2, P, FN) \ fn_for_each2_XXX((L1), (L2), P, FN, _in_merge) #define fn_for_each_not_in_set(L1, L2, P, FN) \ fn_for_each2_XXX((L1), (L2), P, FN, _not_in_set) #define LABEL_MEDIATES(L, C) \ ({ \ struct aa_profile *profile; \ struct label_it i; \ int ret = 0; \ label_for_each(i, (L), profile) { \ if (RULE_MEDIATES(&profile->rules, (C))) { \ ret = 1; \ break; \ } \ } \ ret; \ }) void aa_labelset_destroy(struct aa_labelset *ls); void aa_labelset_init(struct aa_labelset *ls); void __aa_labelset_update_subtree(struct aa_ns *ns); void aa_label_destroy(struct aa_label *label); void aa_label_free(struct aa_label *label); void aa_label_kref(struct kref *kref); bool aa_label_init(struct aa_label *label, int size, gfp_t gfp); struct aa_label *aa_label_alloc(int size, struct aa_proxy *proxy, gfp_t gfp); bool aa_label_is_subset(struct aa_label *set, struct aa_label *sub); bool aa_label_is_unconfined_subset(struct aa_label *set, struct aa_label *sub); struct aa_profile *__aa_label_next_not_in_set(struct label_it *I, struct aa_label *set, struct aa_label *sub); bool aa_label_remove(struct aa_label *label); struct aa_label *aa_label_insert(struct aa_labelset *ls, struct aa_label *l); bool aa_label_replace(struct aa_label *old, struct aa_label *new); bool aa_label_make_newest(struct aa_labelset *ls, struct aa_label *old, struct aa_label *new); struct aa_label *aa_label_find(struct aa_label *l); struct aa_profile *aa_label_next_in_merge(struct label_it *I, struct aa_label *a, struct aa_label *b); struct aa_label *aa_label_find_merge(struct aa_label *a, struct aa_label *b); struct aa_label *aa_label_merge(struct aa_label *a, struct aa_label *b, gfp_t gfp); bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp); #define FLAGS_NONE 0 #define FLAG_SHOW_MODE 1 #define FLAG_VIEW_SUBNS 2 #define FLAG_HIDDEN_UNCONFINED 4 #define FLAG_ABS_ROOT 8 int aa_label_snxprint(char *str, size_t size, struct aa_ns *view, struct aa_label *label, int flags); int aa_label_asxprint(char **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); int aa_label_acntsxprint(char __counted **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_xaudit(struct audit_buffer *ab, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_seq_xprint(struct seq_file *f, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_xprintk(struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_audit(struct audit_buffer *ab, struct aa_label *label, gfp_t gfp); void aa_label_seq_print(struct seq_file *f, struct aa_label *label, gfp_t gfp); void aa_label_printk(struct aa_label *label, gfp_t gfp); struct aa_label *aa_label_strn_parse(struct aa_label *base, const char *str, size_t n, gfp_t gfp, bool create, bool force_stack); struct aa_label *aa_label_parse(struct aa_label *base, const char *str, gfp_t gfp, bool create, bool force_stack); static inline const char *aa_label_strn_split(const char *str, int n) { const char *pos; aa_state_t state; state = aa_dfa_matchn_until(stacksplitdfa, DFA_START, str, n, &pos); if (!ACCEPT_TABLE(stacksplitdfa)[state]) return NULL; return pos - 3; } static inline const char *aa_label_str_split(const char *str) { const char *pos; aa_state_t state; state = aa_dfa_match_until(stacksplitdfa, DFA_START, str, &pos); if (!ACCEPT_TABLE(stacksplitdfa)[state]) return NULL; return pos - 3; } struct aa_perms; struct aa_ruleset; int aa_label_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t state, bool subns, u32 request, struct aa_perms *perms); /** * __aa_get_label - get a reference count to uncounted label reference * @l: reference to get a count on * * Returns: pointer to reference OR NULL if race is lost and reference is * being repeated. * Requires: lock held, and the return code MUST be checked */ static inline struct aa_label *__aa_get_label(struct aa_label *l) { if (l && kref_get_unless_zero(&l->count)) return l; return NULL; } static inline struct aa_label *aa_get_label(struct aa_label *l) { if (l) kref_get(&(l->count)); return l; } /** * aa_get_label_rcu - increment refcount on a label that can be replaced * @l: pointer to label that can be replaced (NOT NULL) * * Returns: pointer to a refcounted label. * else NULL if no label */ static inline struct aa_label *aa_get_label_rcu(struct aa_label __rcu **l) { struct aa_label *c; rcu_read_lock(); do { c = rcu_dereference(*l); } while (c && !kref_get_unless_zero(&c->count)); rcu_read_unlock(); return c; } /** * aa_get_newest_label - find the newest version of @l * @l: the label to check for newer versions of * * Returns: refcounted newest version of @l taking into account * replacement, renames and removals * return @l. */ static inline struct aa_label *aa_get_newest_label(struct aa_label *l) { if (!l) return NULL; if (label_is_stale(l)) { struct aa_label *tmp; AA_BUG(!l->proxy); AA_BUG(!l->proxy->label); /* BUG: only way this can happen is @l ref count and its * replacement count have gone to 0 and are on their way * to destruction. ie. we have a refcounting error */ tmp = aa_get_label_rcu(&l->proxy->label); AA_BUG(!tmp); return tmp; } return aa_get_label(l); } static inline void aa_put_label(struct aa_label *l) { if (l) kref_put(&l->count, aa_label_kref); } struct aa_proxy *aa_alloc_proxy(struct aa_label *l, gfp_t gfp); void aa_proxy_kref(struct kref *kref); static inline struct aa_proxy *aa_get_proxy(struct aa_proxy *proxy) { if (proxy) kref_get(&(proxy->count)); return proxy; } static inline void aa_put_proxy(struct aa_proxy *proxy) { if (proxy) kref_put(&proxy->count, aa_proxy_kref); } void __aa_proxy_redirect(struct aa_label *orig, struct aa_label *new); #endif /* __AA_LABEL_H */ |
7 7 7 3 2 2 2 2 5 4 4 5 5 1 1 1 1 1 1 5 5 5 5 5 1 4 1 1 1 1 1 1 1 1 1 1 1 11 5 5 5 5 5 118 22 50 12 12 43 93 4 4 93 21 20 20 18 20 20 5 5 4 4 4 4 4 4 4 4 4 4 4 4 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Userspace interface * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/netpoll.h> #include <linux/ethtool.h> #include <linux/if_arp.h> #include <linux/module.h> #include <linux/init.h> #include <linux/rtnetlink.h> #include <linux/if_ether.h> #include <linux/slab.h> #include <net/dsa.h> #include <net/sock.h> #include <linux/if_vlan.h> #include <net/switchdev.h> #include <net/net_namespace.h> #include "br_private.h" /* * Determine initial path cost based on speed. * using recommendations from 802.1d standard * * Since driver might sleep need to not be holding any locks. */ static int port_cost(struct net_device *dev) { struct ethtool_link_ksettings ecmd; if (!__ethtool_get_link_ksettings(dev, &ecmd)) { switch (ecmd.base.speed) { case SPEED_10000: return 2; case SPEED_5000: return 3; case SPEED_2500: return 4; case SPEED_1000: return 5; case SPEED_100: return 19; case SPEED_10: return 100; case SPEED_UNKNOWN: return 100; default: if (ecmd.base.speed > SPEED_10000) return 1; } } /* Old silly heuristics based on name */ if (!strncmp(dev->name, "lec", 3)) return 7; if (!strncmp(dev->name, "plip", 4)) return 2500; return 100; /* assume old 10Mbps */ } /* Check for port carrier transitions. */ void br_port_carrier_check(struct net_bridge_port *p, bool *notified) { struct net_device *dev = p->dev; struct net_bridge *br = p->br; if (!(p->flags & BR_ADMIN_COST) && netif_running(dev) && netif_oper_up(dev)) p->path_cost = port_cost(dev); *notified = false; if (!netif_running(br->dev)) return; spin_lock_bh(&br->lock); if (netif_running(dev) && netif_oper_up(dev)) { if (p->state == BR_STATE_DISABLED) { br_stp_enable_port(p); *notified = true; } } else { if (p->state != BR_STATE_DISABLED) { br_stp_disable_port(p); *notified = true; } } spin_unlock_bh(&br->lock); } static void br_port_set_promisc(struct net_bridge_port *p) { int err = 0; if (br_promisc_port(p)) return; err = dev_set_promiscuity(p->dev, 1); if (err) return; br_fdb_unsync_static(p->br, p); p->flags |= BR_PROMISC; } static void br_port_clear_promisc(struct net_bridge_port *p) { int err; /* Check if the port is already non-promisc or if it doesn't * support UNICAST filtering. Without unicast filtering support * we'll end up re-enabling promisc mode anyway, so just check for * it here. */ if (!br_promisc_port(p) || !(p->dev->priv_flags & IFF_UNICAST_FLT)) return; /* Since we'll be clearing the promisc mode, program the port * first so that we don't have interruption in traffic. */ err = br_fdb_sync_static(p->br, p); if (err) return; dev_set_promiscuity(p->dev, -1); p->flags &= ~BR_PROMISC; } /* When a port is added or removed or when certain port flags * change, this function is called to automatically manage * promiscuity setting of all the bridge ports. We are always called * under RTNL so can skip using rcu primitives. */ void br_manage_promisc(struct net_bridge *br) { struct net_bridge_port *p; bool set_all = false; /* If vlan filtering is disabled or bridge interface is placed * into promiscuous mode, place all ports in promiscuous mode. */ if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br->dev)) set_all = true; list_for_each_entry(p, &br->port_list, list) { if (set_all) { br_port_set_promisc(p); } else { /* If the number of auto-ports is <= 1, then all other * ports will have their output configuration * statically specified through fdbs. Since ingress * on the auto-port becomes forwarding/egress to other * ports and egress configuration is statically known, * we can say that ingress configuration of the * auto-port is also statically known. * This lets us disable promiscuous mode and write * this config to hw. */ if ((p->dev->priv_flags & IFF_UNICAST_FLT) && (br->auto_cnt == 0 || (br->auto_cnt == 1 && br_auto_port(p)))) br_port_clear_promisc(p); else br_port_set_promisc(p); } } } int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev) { struct net_bridge_port *old_backup = rtnl_dereference(p->backup_port); struct net_bridge_port *backup_p = NULL; ASSERT_RTNL(); if (backup_dev) { if (!netif_is_bridge_port(backup_dev)) return -ENOENT; backup_p = br_port_get_rtnl(backup_dev); if (backup_p->br != p->br) return -EINVAL; } if (p == backup_p) return -EINVAL; if (old_backup == backup_p) return 0; /* if the backup link is already set, clear it */ if (old_backup) old_backup->backup_redirected_cnt--; if (backup_p) backup_p->backup_redirected_cnt++; rcu_assign_pointer(p->backup_port, backup_p); return 0; } static void nbp_backup_clear(struct net_bridge_port *p) { nbp_backup_change(p, NULL); if (p->backup_redirected_cnt) { struct net_bridge_port *cur_p; list_for_each_entry(cur_p, &p->br->port_list, list) { struct net_bridge_port *backup_p; backup_p = rtnl_dereference(cur_p->backup_port); if (backup_p == p) nbp_backup_change(cur_p, NULL); } } WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt); } static void nbp_update_port_count(struct net_bridge *br) { struct net_bridge_port *p; u32 cnt = 0; list_for_each_entry(p, &br->port_list, list) { if (br_auto_port(p)) cnt++; } if (br->auto_cnt != cnt) { br->auto_cnt = cnt; br_manage_promisc(br); } } static void nbp_delete_promisc(struct net_bridge_port *p) { /* If port is currently promiscuous, unset promiscuity. * Otherwise, it is a static port so remove all addresses * from it. */ dev_set_allmulti(p->dev, -1); if (br_promisc_port(p)) dev_set_promiscuity(p->dev, -1); else br_fdb_unsync_static(p->br, p); } static void release_nbp(struct kobject *kobj) { struct net_bridge_port *p = container_of(kobj, struct net_bridge_port, kobj); kfree(p); } static void brport_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { struct net_bridge_port *p = kobj_to_brport(kobj); net_ns_get_ownership(dev_net(p->dev), uid, gid); } static const struct kobj_type brport_ktype = { #ifdef CONFIG_SYSFS .sysfs_ops = &brport_sysfs_ops, #endif .release = release_nbp, .get_ownership = brport_get_ownership, }; static void destroy_nbp(struct net_bridge_port *p) { struct net_device *dev = p->dev; p->br = NULL; p->dev = NULL; netdev_put(dev, &p->dev_tracker); kobject_put(&p->kobj); } static void destroy_nbp_rcu(struct rcu_head *head) { struct net_bridge_port *p = container_of(head, struct net_bridge_port, rcu); destroy_nbp(p); } static unsigned get_max_headroom(struct net_bridge *br) { unsigned max_headroom = 0; struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { unsigned dev_headroom = netdev_get_fwd_headroom(p->dev); if (dev_headroom > max_headroom) max_headroom = dev_headroom; } return max_headroom; } static void update_headroom(struct net_bridge *br, int new_hr) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) netdev_set_rx_headroom(p->dev, new_hr); br->dev->needed_headroom = new_hr; } /* Delete port(interface) from bridge is done in two steps. * via RCU. First step, marks device as down. That deletes * all the timers and stops new packets from flowing through. * * Final cleanup doesn't occur until after all CPU's finished * processing packets. * * Protected from multiple admin operations by RTNL mutex */ static void del_nbp(struct net_bridge_port *p) { struct net_bridge *br = p->br; struct net_device *dev = p->dev; sysfs_remove_link(br->ifobj, p->dev->name); nbp_delete_promisc(p); spin_lock_bh(&br->lock); br_stp_disable_port(p); spin_unlock_bh(&br->lock); br_mrp_port_del(br, p); br_cfm_port_del(br, p); br_ifinfo_notify(RTM_DELLINK, NULL, p); list_del_rcu(&p->list); if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom) update_headroom(br, get_max_headroom(br)); netdev_reset_rx_headroom(dev); nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); switchdev_deferred_process(); nbp_backup_clear(p); nbp_update_port_count(br); netdev_upper_dev_unlink(dev, br->dev); dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); br_multicast_del_port(p); kobject_uevent(&p->kobj, KOBJ_REMOVE); kobject_del(&p->kobj); br_netpoll_disable(p); call_rcu(&p->rcu, destroy_nbp_rcu); } /* Delete bridge device */ void br_dev_delete(struct net_device *dev, struct list_head *head) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p, *n; list_for_each_entry_safe(p, n, &br->port_list, list) { del_nbp(p); } br_recalculate_neigh_suppress_enabled(br); br_fdb_delete_by_port(br, NULL, 0, 1); cancel_delayed_work_sync(&br->gc_work); br_sysfs_delbr(br->dev); unregister_netdevice_queue(br->dev, head); } /* find an available port number */ static int find_portno(struct net_bridge *br) { int index; struct net_bridge_port *p; unsigned long *inuse; inuse = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); if (!inuse) return -ENOMEM; __set_bit(0, inuse); /* zero is reserved */ list_for_each_entry(p, &br->port_list, list) __set_bit(p->port_no, inuse); index = find_first_zero_bit(inuse, BR_MAX_PORTS); bitmap_free(inuse); return (index >= BR_MAX_PORTS) ? -EXFULL : index; } /* called with RTNL but without bridge lock */ static struct net_bridge_port *new_nbp(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; int index, err; index = find_portno(br); if (index < 0) return ERR_PTR(index); p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return ERR_PTR(-ENOMEM); p->br = br; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; p->port_no = index; p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; br_init_port(p); br_set_state(p, BR_STATE_DISABLED); br_stp_port_timer_init(p); err = br_multicast_add_port(p); if (err) { netdev_put(dev, &p->dev_tracker); kfree(p); p = ERR_PTR(err); } return p; } int br_add_bridge(struct net *net, const char *name) { struct net_device *dev; int res; dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN, br_dev_setup); if (!dev) return -ENOMEM; dev_net_set(dev, net); dev->rtnl_link_ops = &br_link_ops; res = register_netdevice(dev); if (res) free_netdev(dev); return res; } int br_del_bridge(struct net *net, const char *name) { struct net_device *dev; int ret = 0; dev = __dev_get_by_name(net, name); if (dev == NULL) ret = -ENXIO; /* Could not find device */ else if (!netif_is_bridge_master(dev)) { /* Attempt to delete non bridge device! */ ret = -EPERM; } else if (dev->flags & IFF_UP) { /* Not shutdown yet. */ ret = -EBUSY; } else br_dev_delete(dev, NULL); return ret; } /* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */ static int br_mtu_min(const struct net_bridge *br) { const struct net_bridge_port *p; int ret_mtu = 0; list_for_each_entry(p, &br->port_list, list) if (!ret_mtu || ret_mtu > p->dev->mtu) ret_mtu = p->dev->mtu; return ret_mtu ? ret_mtu : ETH_DATA_LEN; } void br_mtu_auto_adjust(struct net_bridge *br) { ASSERT_RTNL(); /* if the bridge MTU was manually configured don't mess with it */ if (br_opt_get(br, BROPT_MTU_SET_BY_USER)) return; /* change to the minimum MTU and clear the flag which was set by * the bridge ndo_change_mtu callback */ dev_set_mtu(br->dev, br_mtu_min(br)); br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false); } static void br_set_gso_limits(struct net_bridge *br) { unsigned int tso_max_size = TSO_MAX_SIZE; const struct net_bridge_port *p; u16 tso_max_segs = TSO_MAX_SEGS; list_for_each_entry(p, &br->port_list, list) { tso_max_size = min(tso_max_size, p->dev->tso_max_size); tso_max_segs = min(tso_max_segs, p->dev->tso_max_segs); } netif_set_tso_max_size(br->dev, tso_max_size); netif_set_tso_max_segs(br->dev, tso_max_segs); } /* * Recomputes features using slave's features */ netdev_features_t br_features_recompute(struct net_bridge *br, netdev_features_t features) { struct net_bridge_port *p; netdev_features_t mask; if (list_empty(&br->port_list)) return features; mask = features; features &= ~NETIF_F_ONE_FOR_ALL; list_for_each_entry(p, &br->port_list, list) { features = netdev_increment_features(features, p->dev->features, mask); } features = netdev_add_tso_features(features, mask); return features; } /* called with RTNL */ int br_add_if(struct net_bridge *br, struct net_device *dev, struct netlink_ext_ack *extack) { struct net_bridge_port *p; int err = 0; unsigned br_hr, dev_hr; bool changed_addr, fdb_synced = false; /* Don't allow bridging non-ethernet like devices. */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) return -EINVAL; /* No bridging of bridges */ if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) { NL_SET_ERR_MSG(extack, "Can not enslave a bridge to a bridge"); return -ELOOP; } /* Device has master upper dev */ if (netdev_master_upper_dev_get(dev)) return -EBUSY; /* No bridging devices that dislike that (e.g. wireless) */ if (dev->priv_flags & IFF_DONT_BRIDGE) { NL_SET_ERR_MSG(extack, "Device does not allow enslaving to a bridge"); return -EOPNOTSUPP; } p = new_nbp(br, dev); if (IS_ERR(p)) return PTR_ERR(p); call_netdevice_notifiers(NETDEV_JOIN, dev); err = dev_set_allmulti(dev, 1); if (err) { br_multicast_del_port(p); netdev_put(dev, &p->dev_tracker); kfree(p); /* kobject not yet init'd, manually free */ goto err1; } err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj), SYSFS_BRIDGE_PORT_ATTR); if (err) goto err2; err = br_sysfs_addif(p); if (err) goto err2; err = br_netpoll_enable(p); if (err) goto err3; err = netdev_rx_handler_register(dev, br_get_rx_handler(dev), p); if (err) goto err4; dev->priv_flags |= IFF_BRIDGE_PORT; err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack); if (err) goto err5; dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); nbp_update_port_count(br); if (!br_promisc_port(p) && (p->dev->priv_flags & IFF_UNICAST_FLT)) { /* When updating the port count we also update all ports' * promiscuous mode. * A port leaving promiscuous mode normally gets the bridge's * fdb synced to the unicast filter (if supported), however, * `br_port_clear_promisc` does not distinguish between * non-promiscuous ports and *new* ports, so we need to * sync explicitly here. */ fdb_synced = br_fdb_sync_static(br, p) == 0; if (!fdb_synced) netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n"); } netdev_update_features(br->dev); br_hr = br->dev->needed_headroom; dev_hr = netdev_get_fwd_headroom(dev); if (br_hr < dev_hr) update_headroom(br, dev_hr); else netdev_set_rx_headroom(dev, br_hr); if (br_fdb_add_local(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); if (br->dev->addr_assign_type != NET_ADDR_SET) { /* Ask for permission to use this MAC address now, even if we * don't end up choosing it below. */ err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack); if (err) goto err6; } err = nbp_vlan_init(p, extack); if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); goto err6; } spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); if (netif_running(dev) && netif_oper_up(dev) && (br->dev->flags & IFF_UP)) br_stp_enable_port(p); spin_unlock_bh(&br->lock); br_ifinfo_notify(RTM_NEWLINK, NULL, p); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); br_mtu_auto_adjust(br); br_set_gso_limits(br); kobject_uevent(&p->kobj, KOBJ_ADD); return 0; err6: if (fdb_synced) br_fdb_unsync_static(br, p); list_del_rcu(&p->list); br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); netdev_upper_dev_unlink(dev, br->dev); err5: dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); err4: br_netpoll_disable(p); err3: sysfs_remove_link(br->ifobj, p->dev->name); err2: br_multicast_del_port(p); netdev_put(dev, &p->dev_tracker); kobject_put(&p->kobj); dev_set_allmulti(dev, -1); err1: return err; } /* called with RTNL */ int br_del_if(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; bool changed_addr; p = br_port_get_rtnl(dev); if (!p || p->br != br) return -EINVAL; /* Since more than one interface can be attached to a bridge, * there still maybe an alternate path for netconsole to use; * therefore there is no reason for a NETDEV_RELEASE event. */ del_nbp(p); br_mtu_auto_adjust(br); br_set_gso_limits(br); spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); spin_unlock_bh(&br->lock); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); netdev_update_features(br->dev); return 0; } void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) { struct net_bridge *br = p->br; if (mask & BR_AUTO_MASK) nbp_update_port_count(br); if (mask & (BR_NEIGH_SUPPRESS | BR_NEIGH_VLAN_SUPPRESS)) br_recalculate_neigh_suppress_enabled(br); } bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag) { struct net_bridge_port *p; p = br_port_get_rtnl_rcu(dev); if (!p) return false; return p->flags & flag; } EXPORT_SYMBOL_GPL(br_port_flag_is_set); |
1 184 140 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */ #ifndef __IPVLAN_H #define __IPVLAN_H #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/if_link.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/inetdevice.h> #include <linux/netfilter.h> #include <net/ip.h> #include <net/ip6_route.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/route.h> #include <net/addrconf.h> #include <net/l3mdev.h> #define IPVLAN_DRV "ipvlan" #define IPV_DRV_VER "0.1" #define IPVLAN_HASH_SIZE (1 << BITS_PER_BYTE) #define IPVLAN_HASH_MASK (IPVLAN_HASH_SIZE - 1) #define IPVLAN_MAC_FILTER_BITS 8 #define IPVLAN_MAC_FILTER_SIZE (1 << IPVLAN_MAC_FILTER_BITS) #define IPVLAN_MAC_FILTER_MASK (IPVLAN_MAC_FILTER_SIZE - 1) #define IPVLAN_QBACKLOG_LIMIT 1000 typedef enum { IPVL_IPV6 = 0, IPVL_ICMPV6, IPVL_IPV4, IPVL_ARP, } ipvl_hdr_type; struct ipvl_pcpu_stats { u64_stats_t rx_pkts; u64_stats_t rx_bytes; u64_stats_t rx_mcast; u64_stats_t tx_pkts; u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_errs; u32 tx_drps; }; struct ipvl_port; struct ipvl_dev { struct net_device *dev; struct list_head pnode; struct ipvl_port *port; struct net_device *phy_dev; struct list_head addrs; struct ipvl_pcpu_stats __percpu *pcpu_stats; DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE); netdev_features_t sfeatures; u32 msg_enable; spinlock_t addrs_lock; }; struct ipvl_addr { struct ipvl_dev *master; /* Back pointer to master */ union { struct in6_addr ip6; /* IPv6 address on logical interface */ struct in_addr ip4; /* IPv4 address on logical interface */ } ipu; #define ip6addr ipu.ip6 #define ip4addr ipu.ip4 struct hlist_node hlnode; /* Hash-table linkage */ struct list_head anode; /* logical-interface linkage */ ipvl_hdr_type atype; struct rcu_head rcu; }; struct ipvl_port { struct net_device *dev; possible_net_t pnet; struct hlist_head hlhead[IPVLAN_HASH_SIZE]; struct list_head ipvlans; u16 mode; u16 flags; u16 dev_id_start; struct work_struct wq; struct sk_buff_head backlog; int count; struct ida ida; netdevice_tracker dev_tracker; }; struct ipvl_skb_cb { bool tx_pkt; }; #define IPVL_SKB_CB(_skb) ((struct ipvl_skb_cb *)&((_skb)->cb[0])) static inline struct ipvl_port *ipvlan_port_get_rcu(const struct net_device *d) { return rcu_dereference(d->rx_handler_data); } static inline struct ipvl_port *ipvlan_port_get_rcu_bh(const struct net_device *d) { return rcu_dereference_bh(d->rx_handler_data); } static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d) { return rtnl_dereference(d->rx_handler_data); } static inline bool ipvlan_is_private(const struct ipvl_port *port) { return !!(port->flags & IPVLAN_F_PRIVATE); } static inline void ipvlan_mark_private(struct ipvl_port *port) { port->flags |= IPVLAN_F_PRIVATE; } static inline void ipvlan_clear_private(struct ipvl_port *port) { port->flags &= ~IPVLAN_F_PRIVATE; } static inline bool ipvlan_is_vepa(const struct ipvl_port *port) { return !!(port->flags & IPVLAN_F_VEPA); } static inline void ipvlan_mark_vepa(struct ipvl_port *port) { port->flags |= IPVLAN_F_VEPA; } static inline void ipvlan_clear_vepa(struct ipvl_port *port) { port->flags &= ~IPVLAN_F_VEPA; } void ipvlan_init_secret(void); unsigned int ipvlan_mac_hash(const unsigned char *addr); rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb); void ipvlan_process_multicast(struct work_struct *work); int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev); void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr); struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6); bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); void ipvlan_ht_addr_del(struct ipvl_addr *addr); struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, int addr_type, bool use_dest); void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type); void ipvlan_count_rx(const struct ipvl_dev *ipvlan, unsigned int len, bool success, bool mcast); int ipvlan_link_new(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); void ipvlan_link_delete(struct net_device *dev, struct list_head *head); void ipvlan_link_setup(struct net_device *dev); int ipvlan_link_register(struct rtnl_link_ops *ops); #ifdef CONFIG_IPVLAN_L3S int ipvlan_l3s_register(struct ipvl_port *port); void ipvlan_l3s_unregister(struct ipvl_port *port); void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet); int ipvlan_l3s_init(void); void ipvlan_l3s_cleanup(void); #else static inline int ipvlan_l3s_register(struct ipvl_port *port) { return -ENOTSUPP; } static inline void ipvlan_l3s_unregister(struct ipvl_port *port) { } static inline void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet) { } static inline int ipvlan_l3s_init(void) { return 0; } static inline void ipvlan_l3s_cleanup(void) { } #endif /* CONFIG_IPVLAN_L3S */ static inline bool netif_is_ipvlan_port(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == ipvlan_handle_frame; } #endif /* __IPVLAN_H */ |
6 16 20 24 38 11 3 10 10 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 | /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #ifndef _LINUX_SKMSG_H #define _LINUX_SKMSG_H #include <linux/bpf.h> #include <linux/filter.h> #include <linux/scatterlist.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/tcp.h> #include <net/strparser.h> #define MAX_MSG_FRAGS MAX_SKB_FRAGS #define NR_MSG_FRAG_IDS (MAX_MSG_FRAGS + 1) enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, __SK_NONE, }; struct sk_msg_sg { u32 start; u32 curr; u32 end; u32 size; u32 copybreak; DECLARE_BITMAP(copy, MAX_MSG_FRAGS + 2); /* The extra two elements: * 1) used for chaining the front and sections when the list becomes * partitioned (e.g. end < start). The crypto APIs require the * chaining; * 2) to chain tailer SG entries after the message. */ struct scatterlist data[MAX_MSG_FRAGS + 2]; }; /* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { struct sk_msg_sg sg; void *data; void *data_end; u32 apply_bytes; u32 cork_bytes; u32 flags; struct sk_buff *skb; struct sock *sk_redir; struct sock *sk; struct list_head list; }; struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; struct bpf_prog *skb_verdict; }; enum sk_psock_state_bits { SK_PSOCK_TX_ENABLED, SK_PSOCK_RX_STRP_ENABLED, }; struct sk_psock_link { struct list_head list; struct bpf_map *map; void *link_raw; }; struct sk_psock_work_state { u32 len; u32 off; }; struct sk_psock { struct sock *sk; struct sock *sk_redir; u32 apply_bytes; u32 cork_bytes; u32 eval; bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; spinlock_t ingress_lock; unsigned long state; struct list_head link; spinlock_t link_lock; refcount_t refcnt; void (*saved_unhash)(struct sock *sk); void (*saved_destroy)(struct sock *sk); void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; struct delayed_work work; struct rcu_work rwork; }; int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce); int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, u32 off, u32 len); void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len); int sk_msg_free(struct sock *sk, struct sk_msg *msg); int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg); void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes); void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes); int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { WARN_ON(i == msg->sg.end && bytes); } static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) { if (psock->apply_bytes) { if (psock->apply_bytes < bytes) psock->apply_bytes = 0; else psock->apply_bytes -= bytes; } } static inline u32 sk_msg_iter_dist(u32 start, u32 end) { return end >= start ? end - start : end + (NR_MSG_FRAG_IDS - start); } #define sk_msg_iter_var_prev(var) \ do { \ if (var == 0) \ var = NR_MSG_FRAG_IDS - 1; \ else \ var--; \ } while (0) #define sk_msg_iter_var_next(var) \ do { \ var++; \ if (var == NR_MSG_FRAG_IDS) \ var = 0; \ } while (0) #define sk_msg_iter_prev(msg, which) \ sk_msg_iter_var_prev(msg->sg.which) #define sk_msg_iter_next(msg, which) \ sk_msg_iter_var_next(msg->sg.which) static inline void sk_msg_init(struct sk_msg *msg) { BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS); memset(msg, 0, sizeof(*msg)); sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); } static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, int which, u32 size) { dst->sg.data[which] = src->sg.data[which]; dst->sg.data[which].length = size; dst->sg.size += size; src->sg.size -= size; src->sg.data[which].length -= size; src->sg.data[which].offset += size; } static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) { memcpy(dst, src, sizeof(*src)); sk_msg_init(src); } static inline bool sk_msg_full(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end) == MAX_MSG_FRAGS; } static inline u32 sk_msg_elem_used(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end); } static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) { return &msg->sg.data[which]; } static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which) { return msg->sg.data[which]; } static inline struct page *sk_msg_page(struct sk_msg *msg, int which) { return sg_page(sk_msg_elem(msg, which)); } static inline bool sk_msg_to_ingress(const struct sk_msg *msg) { return msg->flags & BPF_F_INGRESS; } static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) { struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); if (test_bit(msg->sg.start, msg->sg.copy)) { msg->data = NULL; msg->data_end = NULL; } else { msg->data = sg_virt(sge); msg->data_end = msg->data + sge->length; } } static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, u32 len, u32 offset) { struct scatterlist *sge; get_page(page); sge = sk_msg_elem(msg, msg->sg.end); sg_set_page(sge, page, len, offset); sg_unmark_end(sge); __set_bit(msg->sg.end, msg->sg.copy); msg->sg.size += len; sk_msg_iter_next(msg, end); } static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { if (copy_state) __set_bit(i, msg->sg.copy); else __clear_bit(i, msg->sg.copy); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; } while (1); } static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, true); } static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, false); } static inline struct sk_psock *sk_psock(const struct sock *sk) { return __rcu_dereference_sk_user_data_with_flags(sk, SK_USER_DATA_PSOCK); } static inline void sk_psock_set_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { set_bit(bit, &psock->state); } static inline void sk_psock_clear_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { clear_bit(bit, &psock->state); } static inline bool sk_psock_test_state(const struct sk_psock *psock, enum sk_psock_state_bits bit) { return test_bit(bit, &psock->state); } static inline void sock_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); kfree_skb(skb); } static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) list_add_tail(&msg->list, &psock->ingress_msg); else { sk_msg_free(psock->sk, msg); kfree(msg); } spin_unlock_bh(&psock->ingress_lock); } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); if (msg) list_del(&msg->list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock, struct sk_msg *msg) { struct sk_msg *ret; spin_lock_bh(&psock->ingress_lock); if (list_is_last(&msg->list, &psock->ingress_msg)) ret = NULL; else ret = list_next_entry(msg, list); spin_unlock_bh(&psock->ingress_lock); return ret; } static inline bool sk_psock_queue_empty(const struct sk_psock *psock) { return psock ? list_empty(&psock->ingress_msg) : true; } static inline void kfree_sk_msg(struct sk_msg *msg) { if (msg->skb) consume_skb(msg->skb); kfree(msg); } static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; sk->sk_err = err; sk_error_report(sk); } struct sk_psock *sk_psock_init(struct sock *sk, int node); void sk_psock_stop(struct sk_psock *psock); #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); #else static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { return -EOPNOTSUPP; } static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { } static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { } #endif void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); static inline struct sk_psock_link *sk_psock_init_link(void) { return kzalloc(sizeof(struct sk_psock_link), GFP_ATOMIC | __GFP_NOWARN); } static inline void sk_psock_free_link(struct sk_psock_link *link) { kfree(link); } struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); static inline void sk_psock_cork_free(struct sk_psock *psock) { if (psock->cork) { sk_msg_free(psock->sk, psock->cork); kfree(psock->cork); psock->cork = NULL; } } static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, true); } static inline struct sk_psock *sk_psock_get(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock && !refcount_inc_not_zero(&psock->refcnt)) psock = NULL; rcu_read_unlock(); return psock; } void sk_psock_drop(struct sock *sk, struct sk_psock *psock); static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) { if (refcount_dec_and_test(&psock->refcnt)) sk_psock_drop(sk, psock); } static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { if (psock->saved_data_ready) psock->saved_data_ready(sk); else sk->sk_data_ready(sk); } static inline void psock_set_prog(struct bpf_prog **pprog, struct bpf_prog *prog) { prog = xchg(pprog, prog); if (prog) bpf_prog_put(prog); } static inline int psock_replace_prog(struct bpf_prog **pprog, struct bpf_prog *prog, struct bpf_prog *old) { if (cmpxchg(pprog, old, prog) != old) return -ENOENT; if (old) bpf_prog_put(old); return 0; } static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); static inline bool sk_psock_strp_enabled(struct sk_psock *psock) { if (!psock) return false; return !!psock->saved_data_ready; } static inline bool sk_is_udp(const struct sock *sk) { return sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP; } #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) /* We only have two bits so far. */ #define BPF_F_PTR_MASK ~(BPF_F_INGRESS | BPF_F_STRPARSER) static inline bool skb_bpf_strparser(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_STRPARSER; } static inline void skb_bpf_set_strparser(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_STRPARSER; } static inline bool skb_bpf_ingress(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_INGRESS; } static inline void skb_bpf_set_ingress(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_INGRESS; } static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, bool ingress) { skb->_sk_redir = (unsigned long)sk_redir; if (ingress) skb->_sk_redir |= BPF_F_INGRESS; } static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return (struct sock *)(sk_redir & BPF_F_PTR_MASK); } static inline void skb_bpf_redirect_clear(struct sk_buff *skb) { skb->_sk_redir = 0; } #endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */ |
1 1 1 1 1 3 3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 | // SPDX-License-Identifier: GPL-2.0 /* * Supplementary group IDs */ #include <linux/cred.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/sort.h> #include <linux/syscalls.h> #include <linux/user_namespace.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> struct group_info *groups_alloc(int gidsetsize) { struct group_info *gi; gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT); if (!gi) return NULL; atomic_set(&gi->usage, 1); gi->ngroups = gidsetsize; return gi; } EXPORT_SYMBOL(groups_alloc); void groups_free(struct group_info *group_info) { kvfree(group_info); } EXPORT_SYMBOL(groups_free); /* export the group_info to a user-space array */ static int groups_to_user(gid_t __user *grouplist, const struct group_info *group_info) { struct user_namespace *user_ns = current_user_ns(); int i; unsigned int count = group_info->ngroups; for (i = 0; i < count; i++) { gid_t gid; gid = from_kgid_munged(user_ns, group_info->gid[i]); if (put_user(gid, grouplist+i)) return -EFAULT; } return 0; } /* fill a group_info from a user-space array - it must be allocated already */ static int groups_from_user(struct group_info *group_info, gid_t __user *grouplist) { struct user_namespace *user_ns = current_user_ns(); int i; unsigned int count = group_info->ngroups; for (i = 0; i < count; i++) { gid_t gid; kgid_t kgid; if (get_user(gid, grouplist+i)) return -EFAULT; kgid = make_kgid(user_ns, gid); if (!gid_valid(kgid)) return -EINVAL; group_info->gid[i] = kgid; } return 0; } static int gid_cmp(const void *_a, const void *_b) { kgid_t a = *(kgid_t *)_a; kgid_t b = *(kgid_t *)_b; return gid_gt(a, b) - gid_lt(a, b); } void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) { unsigned int left, right; if (!group_info) return 0; left = 0; right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; if (gid_gt(grp, group_info->gid[mid])) left = mid + 1; else if (gid_lt(grp, group_info->gid[mid])) right = mid; else return 1; } return 0; } /** * set_groups - Change a group subscription in a set of credentials * @new: The newly prepared set of credentials to alter * @group_info: The group list to install */ void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); get_group_info(group_info); new->group_info = group_info; } EXPORT_SYMBOL(set_groups); /** * set_current_groups - Change current's group subscription * @group_info: The group list to impose * * Validate a group subscription and, if valid, impose it upon current's task * security record. */ int set_current_groups(struct group_info *group_info) { struct cred *new; const struct cred *old; int retval; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); set_groups(new, group_info); retval = security_task_fix_setgroups(new, old); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } EXPORT_SYMBOL(set_current_groups); SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) { const struct cred *cred = current_cred(); int i; if (gidsetsize < 0) return -EINVAL; /* no need to grab task_lock here; it cannot change */ i = cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } if (groups_to_user(grouplist, cred->group_info)) { i = -EFAULT; goto out; } } out: return i; } bool may_setgroups(void) { struct user_namespace *user_ns = current_user_ns(); return ns_capable_setid(user_ns, CAP_SETGID) && userns_may_setgroups(user_ns); } /* * SMP: Our groups are copy-on-write. We can set them safely * without another task interfering. */ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) { struct group_info *group_info; int retval; if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; group_info = groups_alloc(gidsetsize); if (!group_info) return -ENOMEM; retval = groups_from_user(group_info, grouplist); if (retval) { put_group_info(group_info); return retval; } groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); return retval; } /* * Check whether we're fsgid/egid or in the supplemental group.. */ int in_group_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; if (!gid_eq(grp, cred->fsgid)) retval = groups_search(cred->group_info, grp); return retval; } EXPORT_SYMBOL(in_group_p); int in_egroup_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; if (!gid_eq(grp, cred->egid)) retval = groups_search(cred->group_info, grp); return retval; } EXPORT_SYMBOL(in_egroup_p); |
35 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/gen_estimator.c Simple rate estimator. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Eric Dumazet <edumazet@google.com> * * Changes: * Jamal Hadi Salim - moved it to net/core and reshulfed * names to make it usable in general net subsystem. */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/seqlock.h> #include <net/sock.h> #include <net/gen_stats.h> /* This code is NOT intended to be used for statistics collection, * its purpose is to provide a base for statistical multiplexing * for controlled load service. * If you need only statistics, run a user level daemon which * periodically reads byte counters. */ struct net_rate_estimator { struct gnet_stats_basic_sync *bstats; spinlock_t *stats_lock; bool running; struct gnet_stats_basic_sync __percpu *cpu_bstats; u8 ewma_log; u8 intvl_log; /* period : (250ms << intvl_log) */ seqcount_t seq; u64 last_packets; u64 last_bytes; u64 avpps; u64 avbps; unsigned long next_jiffies; struct timer_list timer; struct rcu_head rcu; }; static void est_fetch_counters(struct net_rate_estimator *e, struct gnet_stats_basic_sync *b) { gnet_stats_basic_sync_init(b); if (e->stats_lock) spin_lock(e->stats_lock); gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running); if (e->stats_lock) spin_unlock(e->stats_lock); } static void est_timer(struct timer_list *t) { struct net_rate_estimator *est = from_timer(est, t, timer); struct gnet_stats_basic_sync b; u64 b_bytes, b_packets; u64 rate, brate; est_fetch_counters(est, &b); b_bytes = u64_stats_read(&b.bytes); b_packets = u64_stats_read(&b.packets); brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log); brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); rate = (b_packets - est->last_packets) << (10 - est->intvl_log); rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); est->avbps += brate; est->avpps += rate; write_seqcount_end(&est->seq); est->last_bytes = b_bytes; est->last_packets = b_packets; est->next_jiffies += ((HZ/4) << est->intvl_log); if (unlikely(time_after_eq(jiffies, est->next_jiffies))) { /* Ouch... timer was delayed. */ est->next_jiffies = jiffies + 1; } mod_timer(&est->timer, est->next_jiffies); } /** * gen_new_estimator - create a new rate estimator * @bstats: basic statistics * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @lock: lock for statistics and control path * @running: true if @bstats represents a running qdisc, thus @bstats' * internal values might change during basic reads. Only used * if @bstats_cpu is NULL * @opt: rate estimator configuration TLV * * Creates a new rate estimator with &bstats as source and &rate_est * as destination. A new timer with the interval specified in the * configuration TLV is created. Upon each interval, the latest statistics * will be read from &bstats and the estimated rate will be stored in * &rate_est with the statistics lock grabbed during this period. * * Returns 0 on success or a negative error code. * */ int gen_new_estimator(struct gnet_stats_basic_sync *bstats, struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, bool running, struct nlattr *opt) { struct gnet_estimator *parm = nla_data(opt); struct net_rate_estimator *old, *est; struct gnet_stats_basic_sync b; int intvl_log; if (nla_len(opt) < sizeof(*parm)) return -EINVAL; /* allowed timer periods are : * -2 : 250ms, -1 : 500ms, 0 : 1 sec * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec */ if (parm->interval < -2 || parm->interval > 3) return -EINVAL; if (parm->ewma_log == 0 || parm->ewma_log >= 31) return -EINVAL; est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) return -ENOBUFS; seqcount_init(&est->seq); intvl_log = parm->interval + 2; est->bstats = bstats; est->stats_lock = lock; est->running = running; est->ewma_log = parm->ewma_log; est->intvl_log = intvl_log; est->cpu_bstats = cpu_bstats; if (lock) local_bh_disable(); est_fetch_counters(est, &b); if (lock) local_bh_enable(); est->last_bytes = u64_stats_read(&b.bytes); est->last_packets = u64_stats_read(&b.packets); if (lock) spin_lock_bh(lock); old = rcu_dereference_protected(*rate_est, 1); if (old) { del_timer_sync(&old->timer); est->avbps = old->avbps; est->avpps = old->avpps; } est->next_jiffies = jiffies + ((HZ/4) << intvl_log); timer_setup(&est->timer, est_timer, 0); mod_timer(&est->timer, est->next_jiffies); rcu_assign_pointer(*rate_est, est); if (lock) spin_unlock_bh(lock); if (old) kfree_rcu(old, rcu); return 0; } EXPORT_SYMBOL(gen_new_estimator); /** * gen_kill_estimator - remove a rate estimator * @rate_est: rate estimator * * Removes the rate estimator. * */ void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est) { struct net_rate_estimator *est; est = xchg((__force struct net_rate_estimator **)rate_est, NULL); if (est) { timer_shutdown_sync(&est->timer); kfree_rcu(est, rcu); } } EXPORT_SYMBOL(gen_kill_estimator); /** * gen_replace_estimator - replace rate estimator configuration * @bstats: basic statistics * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @lock: lock for statistics and control path * @running: true if @bstats represents a running qdisc, thus @bstats' * internal values might change during basic reads. Only used * if @cpu_bstats is NULL * @opt: rate estimator configuration TLV * * Replaces the configuration of a rate estimator by calling * gen_kill_estimator() and gen_new_estimator(). * * Returns 0 on success or a negative error code. */ int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, bool running, struct nlattr *opt) { return gen_new_estimator(bstats, cpu_bstats, rate_est, lock, running, opt); } EXPORT_SYMBOL(gen_replace_estimator); /** * gen_estimator_active - test if estimator is currently in use * @rate_est: rate estimator * * Returns true if estimator is active, and false if not. */ bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est) { return !!rcu_access_pointer(*rate_est); } EXPORT_SYMBOL(gen_estimator_active); bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est, struct gnet_stats_rate_est64 *sample) { struct net_rate_estimator *est; unsigned seq; rcu_read_lock(); est = rcu_dereference(*rate_est); if (!est) { rcu_read_unlock(); return false; } do { seq = read_seqcount_begin(&est->seq); sample->bps = est->avbps >> 8; sample->pps = est->avpps >> 8; } while (read_seqcount_retry(&est->seq, seq)); rcu_read_unlock(); return true; } EXPORT_SYMBOL(gen_estimator_read); |
24 24 24 24 24 24 28 28 28 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 | // SPDX-License-Identifier: GPL-2.0 #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <net/gro_cells.h> struct gro_cell { struct sk_buff_head napi_skbs; struct napi_struct napi; }; int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct gro_cell *cell; int res; rcu_read_lock(); if (unlikely(!(dev->flags & IFF_UP))) goto drop; if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) { res = netif_rx(skb); goto unlock; } cell = this_cpu_ptr(gcells->cells); if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) { drop: dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); res = NET_RX_DROP; goto unlock; } __skb_queue_tail(&cell->napi_skbs, skb); if (skb_queue_len(&cell->napi_skbs) == 1) napi_schedule(&cell->napi); res = NET_RX_SUCCESS; unlock: rcu_read_unlock(); return res; } EXPORT_SYMBOL(gro_cells_receive); /* called under BH context */ static int gro_cell_poll(struct napi_struct *napi, int budget) { struct gro_cell *cell = container_of(napi, struct gro_cell, napi); struct sk_buff *skb; int work_done = 0; while (work_done < budget) { skb = __skb_dequeue(&cell->napi_skbs); if (!skb) break; napi_gro_receive(napi, skb); work_done++; } if (work_done < budget) napi_complete_done(napi, work_done); return work_done; } int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) { int i; gcells->cells = alloc_percpu(struct gro_cell); if (!gcells->cells) return -ENOMEM; for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); __skb_queue_head_init(&cell->napi_skbs); set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); netif_napi_add(dev, &cell->napi, gro_cell_poll); napi_enable(&cell->napi); } return 0; } EXPORT_SYMBOL(gro_cells_init); struct percpu_free_defer { struct rcu_head rcu; void __percpu *ptr; }; static void percpu_free_defer_callback(struct rcu_head *head) { struct percpu_free_defer *defer; defer = container_of(head, struct percpu_free_defer, rcu); free_percpu(defer->ptr); kfree(defer); } void gro_cells_destroy(struct gro_cells *gcells) { struct percpu_free_defer *defer; int i; if (!gcells->cells) return; for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); napi_disable(&cell->napi); __netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } /* We need to observe an rcu grace period before freeing ->cells, * because netpoll could access dev->napi_list under rcu protection. * Try hard using call_rcu() instead of synchronize_rcu(), * because we might be called from cleanup_net(), and we * definitely do not want to block this critical task. */ defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN); if (likely(defer)) { defer->ptr = gcells->cells; call_rcu(&defer->rcu, percpu_free_defer_callback); } else { /* We do not hold RTNL at this point, synchronize_net() * would not be able to expedite this sync. */ synchronize_rcu_expedited(); free_percpu(gcells->cells); } gcells->cells = NULL; } EXPORT_SYMBOL(gro_cells_destroy); |
10 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct skb_array' datastructure. * * Author: * Michael S. Tsirkin <mst@redhat.com> * * Copyright (C) 2016 Red Hat, Inc. * * Limited-size FIFO of skbs. Can be used more or less whenever * sk_buff_head can be used, except you need to know the queue size in * advance. * Implemented as a type-safe wrapper around ptr_ring. */ #ifndef _LINUX_SKB_ARRAY_H #define _LINUX_SKB_ARRAY_H 1 #ifdef __KERNEL__ #include <linux/ptr_ring.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #endif struct skb_array { struct ptr_ring ring; }; /* Might be slightly faster than skb_array_full below, but callers invoking * this in a loop must use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_full(struct skb_array *a) { return __ptr_ring_full(&a->ring); } static inline bool skb_array_full(struct skb_array *a) { return ptr_ring_full(&a->ring); } static inline int skb_array_produce(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce(&a->ring, skb); } static inline int skb_array_produce_irq(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_irq(&a->ring, skb); } static inline int skb_array_produce_bh(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_bh(&a->ring, skb); } static inline int skb_array_produce_any(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_any(&a->ring, skb); } /* Might be slightly faster than skb_array_empty below, but only safe if the * array is never resized. Also, callers invoking this in a loop must take care * to use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_empty(struct skb_array *a) { return __ptr_ring_empty(&a->ring); } static inline struct sk_buff *__skb_array_peek(struct skb_array *a) { return __ptr_ring_peek(&a->ring); } static inline bool skb_array_empty(struct skb_array *a) { return ptr_ring_empty(&a->ring); } static inline bool skb_array_empty_bh(struct skb_array *a) { return ptr_ring_empty_bh(&a->ring); } static inline bool skb_array_empty_irq(struct skb_array *a) { return ptr_ring_empty_irq(&a->ring); } static inline bool skb_array_empty_any(struct skb_array *a) { return ptr_ring_empty_any(&a->ring); } static inline struct sk_buff *__skb_array_consume(struct skb_array *a) { return __ptr_ring_consume(&a->ring); } static inline struct sk_buff *skb_array_consume(struct skb_array *a) { return ptr_ring_consume(&a->ring); } static inline int skb_array_consume_batched(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a) { return ptr_ring_consume_irq(&a->ring); } static inline int skb_array_consume_batched_irq(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_irq(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_any(struct skb_array *a) { return ptr_ring_consume_any(&a->ring); } static inline int skb_array_consume_batched_any(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_any(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a) { return ptr_ring_consume_bh(&a->ring); } static inline int skb_array_consume_batched_bh(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_bh(&a->ring, (void **)array, n); } static inline int __skb_array_len_with_tag(struct sk_buff *skb) { if (likely(skb)) { int len = skb->len; if (skb_vlan_tag_present(skb)) len += VLAN_HLEN; return len; } else { return 0; } } static inline int skb_array_peek_len(struct skb_array *a) { return PTR_RING_PEEK_CALL(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_irq(struct skb_array *a) { return PTR_RING_PEEK_CALL_IRQ(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_bh(struct skb_array *a) { return PTR_RING_PEEK_CALL_BH(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_any(struct skb_array *a) { return PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_init(&a->ring, size, gfp); } static void __skb_array_destroy_skb(void *ptr) { kfree_skb(ptr); } static inline void skb_array_unconsume(struct skb_array *a, struct sk_buff **skbs, int n) { ptr_ring_unconsume(&a->ring, (void **)skbs, n, __skb_array_destroy_skb); } static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); } static inline int skb_array_resize_multiple(struct skb_array **rings, int nrings, unsigned int size, gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); return ptr_ring_resize_multiple((struct ptr_ring **)rings, nrings, size, gfp, __skb_array_destroy_skb); } static inline void skb_array_cleanup(struct skb_array *a) { ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb); } #endif /* _LINUX_SKB_ARRAY_H */ |
145 133 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 | // SPDX-License-Identifier: GPL-2.0 /* * shstk.c - Intel shadow stack support * * Copyright (c) 2021, Intel Corporation. * Yu-cheng Yu <yu-cheng.yu@intel.com> */ #include <linux/sched.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/sched/signal.h> #include <linux/compat.h> #include <linux/sizes.h> #include <linux/user.h> #include <linux/syscalls.h> #include <asm/msr.h> #include <asm/fpu/xstate.h> #include <asm/fpu/types.h> #include <asm/shstk.h> #include <asm/special_insns.h> #include <asm/fpu/api.h> #include <asm/prctl.h> #define SS_FRAME_SIZE 8 static bool features_enabled(unsigned long features) { return current->thread.features & features; } static void features_set(unsigned long features) { current->thread.features |= features; } static void features_clr(unsigned long features) { current->thread.features &= ~features; } /* * Create a restore token on the shadow stack. A token is always 8-byte * and aligned to 8. */ static int create_rstor_token(unsigned long ssp, unsigned long *token_addr) { unsigned long addr; /* Token must be aligned */ if (!IS_ALIGNED(ssp, 8)) return -EINVAL; addr = ssp - SS_FRAME_SIZE; /* * SSP is aligned, so reserved bits and mode bit are a zero, just mark * the token 64-bit. */ ssp |= BIT(0); if (write_user_shstk_64((u64 __user *)addr, (u64)ssp)) return -EFAULT; if (token_addr) *token_addr = addr; return 0; } /* * VM_SHADOW_STACK will have a guard page. This helps userspace protect * itself from attacks. The reasoning is as follows: * * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The * INCSSP instruction can increment the shadow stack pointer. It is the * shadow stack analog of an instruction like: * * addq $0x80, %rsp * * However, there is one important difference between an ADD on %rsp * and INCSSP. In addition to modifying SSP, INCSSP also reads from the * memory of the first and last elements that were "popped". It can be * thought of as acting like this: * * READ_ONCE(ssp); // read+discard top element on stack * ssp += nr_to_pop * 8; // move the shadow stack * READ_ONCE(ssp-8); // read+discard last popped stack element * * The maximum distance INCSSP can move the SSP is 2040 bytes, before * it would read the memory. Therefore a single page gap will be enough * to prevent any operation from shifting the SSP to an adjacent stack, * since it would have to land in the gap at least once, causing a * fault. */ static unsigned long alloc_shstk(unsigned long addr, unsigned long size, unsigned long token_offset, bool set_res_tok) { int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G; struct mm_struct *mm = current->mm; unsigned long mapped_addr, unused; if (addr) flags |= MAP_FIXED_NOREPLACE; mmap_write_lock(mm); mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); mmap_write_unlock(mm); if (!set_res_tok || IS_ERR_VALUE(mapped_addr)) goto out; if (create_rstor_token(mapped_addr + token_offset, NULL)) { vm_munmap(mapped_addr, size); return -EINVAL; } out: return mapped_addr; } static unsigned long adjust_shstk_size(unsigned long size) { if (size) return PAGE_ALIGN(size); return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G)); } static void unmap_shadow_stack(u64 base, u64 size) { int r; r = vm_munmap(base, size); /* * mmap_write_lock_killable() failed with -EINTR. This means * the process is about to die and have it's MM cleaned up. * This task shouldn't ever make it back to userspace. In this * case it is ok to leak a shadow stack, so just exit out. */ if (r == -EINTR) return; /* * For all other types of vm_munmap() failure, either the * system is out of memory or there is bug. */ WARN_ON_ONCE(r); } static int shstk_setup(void) { struct thread_shstk *shstk = ¤t->thread.shstk; unsigned long addr, size; /* Already enabled */ if (features_enabled(ARCH_SHSTK_SHSTK)) return 0; /* Also not supported for 32 bit and x32 */ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall()) return -EOPNOTSUPP; size = adjust_shstk_size(0); addr = alloc_shstk(0, size, 0, false); if (IS_ERR_VALUE(addr)) return PTR_ERR((void *)addr); fpregs_lock_and_load(); wrmsrl(MSR_IA32_PL3_SSP, addr + size); wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN); fpregs_unlock(); shstk->base = addr; shstk->size = size; features_set(ARCH_SHSTK_SHSTK); return 0; } void reset_thread_features(void) { memset(¤t->thread.shstk, 0, sizeof(struct thread_shstk)); current->thread.features = 0; current->thread.features_locked = 0; } unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags, unsigned long stack_size) { struct thread_shstk *shstk = &tsk->thread.shstk; unsigned long addr, size; /* * If shadow stack is not enabled on the new thread, skip any * switch to a new shadow stack. */ if (!features_enabled(ARCH_SHSTK_SHSTK)) return 0; /* * For CLONE_VFORK the child will share the parents shadow stack. * Make sure to clear the internal tracking of the thread shadow * stack so the freeing logic run for child knows to leave it alone. */ if (clone_flags & CLONE_VFORK) { shstk->base = 0; shstk->size = 0; return 0; } /* * For !CLONE_VM the child will use a copy of the parents shadow * stack. */ if (!(clone_flags & CLONE_VM)) return 0; size = adjust_shstk_size(stack_size); addr = alloc_shstk(0, size, 0, false); if (IS_ERR_VALUE(addr)) return addr; shstk->base = addr; shstk->size = size; return addr + size; } static unsigned long get_user_shstk_addr(void) { unsigned long long ssp; fpregs_lock_and_load(); rdmsrl(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return ssp; } #define SHSTK_DATA_BIT BIT(63) static int put_shstk_data(u64 __user *addr, u64 data) { if (WARN_ON_ONCE(data & SHSTK_DATA_BIT)) return -EINVAL; /* * Mark the high bit so that the sigframe can't be processed as a * return address. */ if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT)) return -EFAULT; return 0; } static int get_shstk_data(unsigned long *data, unsigned long __user *addr) { unsigned long ldata; if (unlikely(get_user(ldata, addr))) return -EFAULT; if (!(ldata & SHSTK_DATA_BIT)) return -EINVAL; *data = ldata & ~SHSTK_DATA_BIT; return 0; } static int shstk_push_sigframe(unsigned long *ssp) { unsigned long target_ssp = *ssp; /* Token must be aligned */ if (!IS_ALIGNED(target_ssp, 8)) return -EINVAL; *ssp -= SS_FRAME_SIZE; if (put_shstk_data((void __user *)*ssp, target_ssp)) return -EFAULT; return 0; } static int shstk_pop_sigframe(unsigned long *ssp) { struct vm_area_struct *vma; unsigned long token_addr; bool need_to_check_vma; int err = 1; /* * It is possible for the SSP to be off the end of a shadow stack by 4 * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes * before it, it might be this case, so check that the address being * read is actually shadow stack. */ if (!IS_ALIGNED(*ssp, 8)) return -EINVAL; need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp; if (need_to_check_vma) mmap_read_lock_killable(current->mm); err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp); if (unlikely(err)) goto out_err; if (need_to_check_vma) { vma = find_vma(current->mm, *ssp); if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) { err = -EFAULT; goto out_err; } mmap_read_unlock(current->mm); } /* Restore SSP aligned? */ if (unlikely(!IS_ALIGNED(token_addr, 8))) return -EINVAL; /* SSP in userspace? */ if (unlikely(token_addr >= TASK_SIZE_MAX)) return -EINVAL; *ssp = token_addr; return 0; out_err: if (need_to_check_vma) mmap_read_unlock(current->mm); return err; } int setup_signal_shadow_stack(struct ksignal *ksig) { void __user *restorer = ksig->ka.sa.sa_restorer; unsigned long ssp; int err; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || !features_enabled(ARCH_SHSTK_SHSTK)) return 0; if (!restorer) return -EINVAL; ssp = get_user_shstk_addr(); if (unlikely(!ssp)) return -EINVAL; err = shstk_push_sigframe(&ssp); if (unlikely(err)) return err; /* Push restorer address */ ssp -= SS_FRAME_SIZE; err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer); if (unlikely(err)) return -EFAULT; fpregs_lock_and_load(); wrmsrl(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return 0; } int restore_signal_shadow_stack(void) { unsigned long ssp; int err; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || !features_enabled(ARCH_SHSTK_SHSTK)) return 0; ssp = get_user_shstk_addr(); if (unlikely(!ssp)) return -EINVAL; err = shstk_pop_sigframe(&ssp); if (unlikely(err)) return err; fpregs_lock_and_load(); wrmsrl(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return 0; } void shstk_free(struct task_struct *tsk) { struct thread_shstk *shstk = &tsk->thread.shstk; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || !features_enabled(ARCH_SHSTK_SHSTK)) return; /* * When fork() with CLONE_VM fails, the child (tsk) already has a * shadow stack allocated, and exit_thread() calls this function to * free it. In this case the parent (current) and the child share * the same mm struct. */ if (!tsk->mm || tsk->mm != current->mm) return; /* * If shstk->base is NULL, then this task is not managing its * own shadow stack (CLONE_VFORK). So skip freeing it. */ if (!shstk->base) return; /* * shstk->base is NULL for CLONE_VFORK child tasks, and so is * normal. But size = 0 on a shstk->base is not normal and * indicated an attempt to free the thread shadow stack twice. * Warn about it. */ if (WARN_ON(!shstk->size)) return; unmap_shadow_stack(shstk->base, shstk->size); shstk->size = 0; } static int wrss_control(bool enable) { u64 msrval; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) return -EOPNOTSUPP; /* * Only enable WRSS if shadow stack is enabled. If shadow stack is not * enabled, WRSS will already be disabled, so don't bother clearing it * when disabling. */ if (!features_enabled(ARCH_SHSTK_SHSTK)) return -EPERM; /* Already enabled/disabled? */ if (features_enabled(ARCH_SHSTK_WRSS) == enable) return 0; fpregs_lock_and_load(); rdmsrl(MSR_IA32_U_CET, msrval); if (enable) { features_set(ARCH_SHSTK_WRSS); msrval |= CET_WRSS_EN; } else { features_clr(ARCH_SHSTK_WRSS); if (!(msrval & CET_WRSS_EN)) goto unlock; msrval &= ~CET_WRSS_EN; } wrmsrl(MSR_IA32_U_CET, msrval); unlock: fpregs_unlock(); return 0; } static int shstk_disable(void) { if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) return -EOPNOTSUPP; /* Already disabled? */ if (!features_enabled(ARCH_SHSTK_SHSTK)) return 0; fpregs_lock_and_load(); /* Disable WRSS too when disabling shadow stack */ wrmsrl(MSR_IA32_U_CET, 0); wrmsrl(MSR_IA32_PL3_SSP, 0); fpregs_unlock(); shstk_free(current); features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS); return 0; } SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags) { bool set_tok = flags & SHADOW_STACK_SET_TOKEN; unsigned long aligned_size; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) return -EOPNOTSUPP; if (flags & ~SHADOW_STACK_SET_TOKEN) return -EINVAL; /* If there isn't space for a token */ if (set_tok && size < 8) return -ENOSPC; if (addr && addr < SZ_4G) return -ERANGE; /* * An overflow would result in attempting to write the restore token * to the wrong location. Not catastrophic, but just return the right * error code and block it. */ aligned_size = PAGE_ALIGN(size); if (aligned_size < size) return -EOVERFLOW; return alloc_shstk(addr, aligned_size, size, set_tok); } long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) { unsigned long features = arg2; if (option == ARCH_SHSTK_STATUS) { return put_user(task->thread.features, (unsigned long __user *)arg2); } if (option == ARCH_SHSTK_LOCK) { task->thread.features_locked |= features; return 0; } /* Only allow via ptrace */ if (task != current) { if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) { task->thread.features_locked &= ~features; return 0; } return -EINVAL; } /* Do not allow to change locked features */ if (features & task->thread.features_locked) return -EPERM; /* Only support enabling/disabling one feature at a time. */ if (hweight_long(features) > 1) return -EINVAL; if (option == ARCH_SHSTK_DISABLE) { if (features & ARCH_SHSTK_WRSS) return wrss_control(false); if (features & ARCH_SHSTK_SHSTK) return shstk_disable(); return -EINVAL; } /* Handle ARCH_SHSTK_ENABLE */ if (features & ARCH_SHSTK_SHSTK) return shstk_setup(); if (features & ARCH_SHSTK_WRSS) return wrss_control(true); return -EINVAL; } |
3609 1032 5886 256 256 256 4390 8474 6978 4417 2928 2928 6979 5886 6976 993 4417 4417 4417 4417 6975 6978 6973 6978 6975 4390 4390 4390 2011 3815 4390 4390 1218 1218 1217 1218 1218 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 | // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/file.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* * Mapping table from "enum tomoyo_path_acl_index" to "enum tomoyo_mac_index". */ static const u8 tomoyo_p2mac[TOMOYO_MAX_PATH_OPERATION] = { [TOMOYO_TYPE_EXECUTE] = TOMOYO_MAC_FILE_EXECUTE, [TOMOYO_TYPE_READ] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_WRITE] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_APPEND] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_UNLINK] = TOMOYO_MAC_FILE_UNLINK, [TOMOYO_TYPE_GETATTR] = TOMOYO_MAC_FILE_GETATTR, [TOMOYO_TYPE_RMDIR] = TOMOYO_MAC_FILE_RMDIR, [TOMOYO_TYPE_TRUNCATE] = TOMOYO_MAC_FILE_TRUNCATE, [TOMOYO_TYPE_SYMLINK] = TOMOYO_MAC_FILE_SYMLINK, [TOMOYO_TYPE_CHROOT] = TOMOYO_MAC_FILE_CHROOT, [TOMOYO_TYPE_UMOUNT] = TOMOYO_MAC_FILE_UMOUNT, }; /* * Mapping table from "enum tomoyo_mkdev_acl_index" to "enum tomoyo_mac_index". */ const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION] = { [TOMOYO_TYPE_MKBLOCK] = TOMOYO_MAC_FILE_MKBLOCK, [TOMOYO_TYPE_MKCHAR] = TOMOYO_MAC_FILE_MKCHAR, }; /* * Mapping table from "enum tomoyo_path2_acl_index" to "enum tomoyo_mac_index". */ const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION] = { [TOMOYO_TYPE_LINK] = TOMOYO_MAC_FILE_LINK, [TOMOYO_TYPE_RENAME] = TOMOYO_MAC_FILE_RENAME, [TOMOYO_TYPE_PIVOT_ROOT] = TOMOYO_MAC_FILE_PIVOT_ROOT, }; /* * Mapping table from "enum tomoyo_path_number_acl_index" to * "enum tomoyo_mac_index". */ const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION] = { [TOMOYO_TYPE_CREATE] = TOMOYO_MAC_FILE_CREATE, [TOMOYO_TYPE_MKDIR] = TOMOYO_MAC_FILE_MKDIR, [TOMOYO_TYPE_MKFIFO] = TOMOYO_MAC_FILE_MKFIFO, [TOMOYO_TYPE_MKSOCK] = TOMOYO_MAC_FILE_MKSOCK, [TOMOYO_TYPE_IOCTL] = TOMOYO_MAC_FILE_IOCTL, [TOMOYO_TYPE_CHMOD] = TOMOYO_MAC_FILE_CHMOD, [TOMOYO_TYPE_CHOWN] = TOMOYO_MAC_FILE_CHOWN, [TOMOYO_TYPE_CHGRP] = TOMOYO_MAC_FILE_CHGRP, }; /** * tomoyo_put_name_union - Drop reference on "struct tomoyo_name_union". * * @ptr: Pointer to "struct tomoyo_name_union". * * Returns nothing. */ void tomoyo_put_name_union(struct tomoyo_name_union *ptr) { tomoyo_put_group(ptr->group); tomoyo_put_name(ptr->filename); } /** * tomoyo_compare_name_union - Check whether a name matches "struct tomoyo_name_union" or not. * * @name: Pointer to "struct tomoyo_path_info". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns "struct tomoyo_path_info" if @name matches @ptr, NULL otherwise. */ const struct tomoyo_path_info * tomoyo_compare_name_union(const struct tomoyo_path_info *name, const struct tomoyo_name_union *ptr) { if (ptr->group) return tomoyo_path_matches_group(name, ptr->group); if (tomoyo_path_matches_pattern(name, ptr->filename)) return ptr->filename; return NULL; } /** * tomoyo_put_number_union - Drop reference on "struct tomoyo_number_union". * * @ptr: Pointer to "struct tomoyo_number_union". * * Returns nothing. */ void tomoyo_put_number_union(struct tomoyo_number_union *ptr) { tomoyo_put_group(ptr->group); } /** * tomoyo_compare_number_union - Check whether a value matches "struct tomoyo_number_union" or not. * * @value: Number to check. * @ptr: Pointer to "struct tomoyo_number_union". * * Returns true if @value matches @ptr, false otherwise. */ bool tomoyo_compare_number_union(const unsigned long value, const struct tomoyo_number_union *ptr) { if (ptr->group) return tomoyo_number_matches_group(value, value, ptr->group); return value >= ptr->values[0] && value <= ptr->values[1]; } /** * tomoyo_add_slash - Add trailing '/' if needed. * * @buf: Pointer to "struct tomoyo_path_info". * * Returns nothing. * * @buf must be generated by tomoyo_encode() because this function does not * allocate memory for adding '/'. */ static void tomoyo_add_slash(struct tomoyo_path_info *buf) { if (buf->is_dir) return; /* * This is OK because tomoyo_encode() reserves space for appending "/". */ strcat((char *) buf->name, "/"); tomoyo_fill_path_info(buf); } /** * tomoyo_get_realpath - Get realpath. * * @buf: Pointer to "struct tomoyo_path_info". * @path: Pointer to "struct path". * * Returns true on success, false otherwise. */ static bool tomoyo_get_realpath(struct tomoyo_path_info *buf, const struct path *path) { buf->name = tomoyo_realpath_from_path(path); if (buf->name) { tomoyo_fill_path_info(buf); return true; } return false; } /** * tomoyo_audit_path_log - Audit path request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_path_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file %s %s\n", tomoyo_path_keyword [r->param.path.operation], r->param.path.filename->name); } /** * tomoyo_audit_path2_log - Audit path/path request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_path2_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file %s %s %s\n", tomoyo_mac_keywords [tomoyo_pp2mac[r->param.path2.operation]], r->param.path2.filename1->name, r->param.path2.filename2->name); } /** * tomoyo_audit_mkdev_log - Audit path/number/number/number request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_mkdev_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file %s %s 0%o %u %u\n", tomoyo_mac_keywords [tomoyo_pnnn2mac[r->param.mkdev.operation]], r->param.mkdev.filename->name, r->param.mkdev.mode, r->param.mkdev.major, r->param.mkdev.minor); } /** * tomoyo_audit_path_number_log - Audit path/number request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_path_number_log(struct tomoyo_request_info *r) { const u8 type = r->param.path_number.operation; u8 radix; char buffer[64]; switch (type) { case TOMOYO_TYPE_CREATE: case TOMOYO_TYPE_MKDIR: case TOMOYO_TYPE_MKFIFO: case TOMOYO_TYPE_MKSOCK: case TOMOYO_TYPE_CHMOD: radix = TOMOYO_VALUE_TYPE_OCTAL; break; case TOMOYO_TYPE_IOCTL: radix = TOMOYO_VALUE_TYPE_HEXADECIMAL; break; default: radix = TOMOYO_VALUE_TYPE_DECIMAL; break; } tomoyo_print_ulong(buffer, sizeof(buffer), r->param.path_number.number, radix); return tomoyo_supervisor(r, "file %s %s %s\n", tomoyo_mac_keywords [tomoyo_pn2mac[type]], r->param.path_number.filename->name, buffer); } /** * tomoyo_check_path_acl - Check permission for path operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. * * To be able to use wildcard for domain transition, this function sets * matching entry on success. Since the caller holds tomoyo_read_lock(), * it is safe to set matching entry. */ static bool tomoyo_check_path_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path_acl *acl = container_of(ptr, typeof(*acl), head); if (acl->perm & (1 << r->param.path.operation)) { r->param.path.matched_path = tomoyo_compare_name_union(r->param.path.filename, &acl->name); return r->param.path.matched_path != NULL; } return false; } /** * tomoyo_check_path_number_acl - Check permission for path number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_path_number_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path_number_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.path_number.operation)) && tomoyo_compare_number_union(r->param.path_number.number, &acl->number) && tomoyo_compare_name_union(r->param.path_number.filename, &acl->name); } /** * tomoyo_check_path2_acl - Check permission for path path operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_path2_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path2_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.path2.operation)) && tomoyo_compare_name_union(r->param.path2.filename1, &acl->name1) && tomoyo_compare_name_union(r->param.path2.filename2, &acl->name2); } /** * tomoyo_check_mkdev_acl - Check permission for path number number number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_mkdev_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_mkdev_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.mkdev.operation)) && tomoyo_compare_number_union(r->param.mkdev.mode, &acl->mode) && tomoyo_compare_number_union(r->param.mkdev.major, &acl->major) && tomoyo_compare_number_union(r->param.mkdev.minor, &acl->minor) && tomoyo_compare_name_union(r->param.mkdev.filename, &acl->name); } /** * tomoyo_same_path_acl - Check for duplicated "struct tomoyo_path_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name); } /** * tomoyo_merge_path_acl - Merge duplicated "struct tomoyo_path_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u16 * const a_perm = &container_of(a, struct tomoyo_path_acl, head) ->perm; u16 perm = READ_ONCE(*a_perm); const u16 b_perm = container_of(b, struct tomoyo_path_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path_acl - Update "struct tomoyo_path_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_path_acl(const u16 perm, struct tomoyo_acl_param *param) { struct tomoyo_path_acl e = { .head.type = TOMOYO_TYPE_PATH_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path_acl, tomoyo_merge_path_acl); tomoyo_put_name_union(&e.name); return error; } /** * tomoyo_same_mkdev_acl - Check for duplicated "struct tomoyo_mkdev_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_mkdev_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_mkdev_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_mkdev_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name) && tomoyo_same_number_union(&p1->mode, &p2->mode) && tomoyo_same_number_union(&p1->major, &p2->major) && tomoyo_same_number_union(&p1->minor, &p2->minor); } /** * tomoyo_merge_mkdev_acl - Merge duplicated "struct tomoyo_mkdev_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_mkdev_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 *const a_perm = &container_of(a, struct tomoyo_mkdev_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_mkdev_acl, head) ->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_mkdev_acl - Update "struct tomoyo_mkdev_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_mkdev_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_mkdev_acl e = { .head.type = TOMOYO_TYPE_MKDEV_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name) || !tomoyo_parse_number_union(param, &e.mode) || !tomoyo_parse_number_union(param, &e.major) || !tomoyo_parse_number_union(param, &e.minor)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_mkdev_acl, tomoyo_merge_mkdev_acl); tomoyo_put_name_union(&e.name); tomoyo_put_number_union(&e.mode); tomoyo_put_number_union(&e.major); tomoyo_put_number_union(&e.minor); return error; } /** * tomoyo_same_path2_acl - Check for duplicated "struct tomoyo_path2_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path2_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path2_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path2_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name1, &p2->name1) && tomoyo_same_name_union(&p1->name2, &p2->name2); } /** * tomoyo_merge_path2_acl - Merge duplicated "struct tomoyo_path2_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path2_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_path2_acl, head) ->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_path2_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path2_acl - Update "struct tomoyo_path2_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_path2_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_path2_acl e = { .head.type = TOMOYO_TYPE_PATH2_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name1) || !tomoyo_parse_name_union(param, &e.name2)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path2_acl, tomoyo_merge_path2_acl); tomoyo_put_name_union(&e.name1); tomoyo_put_name_union(&e.name2); return error; } /** * tomoyo_path_permission - Check permission for single path operation. * * @r: Pointer to "struct tomoyo_request_info". * @operation: Type of operation. * @filename: Filename to check. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_path_permission(struct tomoyo_request_info *r, u8 operation, const struct tomoyo_path_info *filename) { int error; r->type = tomoyo_p2mac[operation]; r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type); if (r->mode == TOMOYO_CONFIG_DISABLED) return 0; r->param_type = TOMOYO_TYPE_PATH_ACL; r->param.path.filename = filename; r->param.path.operation = operation; do { tomoyo_check_acl(r, tomoyo_check_path_acl); error = tomoyo_audit_path_log(r); } while (error == TOMOYO_RETRY_REQUEST); return error; } /** * tomoyo_execute_permission - Check permission for execute operation. * * @r: Pointer to "struct tomoyo_request_info". * @filename: Filename to check. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_execute_permission(struct tomoyo_request_info *r, const struct tomoyo_path_info *filename) { /* * Unlike other permission checks, this check is done regardless of * profile mode settings in order to check for domain transition * preference. */ r->type = TOMOYO_MAC_FILE_EXECUTE; r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type); r->param_type = TOMOYO_TYPE_PATH_ACL; r->param.path.filename = filename; r->param.path.operation = TOMOYO_TYPE_EXECUTE; tomoyo_check_acl(r, tomoyo_check_path_acl); r->ee->transition = r->matched_acl && r->matched_acl->cond ? r->matched_acl->cond->transit : NULL; if (r->mode != TOMOYO_CONFIG_DISABLED) return tomoyo_audit_path_log(r); return 0; } /** * tomoyo_same_path_number_acl - Check for duplicated "struct tomoyo_path_number_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path_number_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path_number_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path_number_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name) && tomoyo_same_number_union(&p1->number, &p2->number); } /** * tomoyo_merge_path_number_acl - Merge duplicated "struct tomoyo_path_number_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path_number_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_path_number_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_path_number_acl, head) ->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path_number_acl - Update ioctl/chmod/chown/chgrp ACL. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_update_path_number_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_path_number_acl e = { .head.type = TOMOYO_TYPE_PATH_NUMBER_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name) || !tomoyo_parse_number_union(param, &e.number)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path_number_acl, tomoyo_merge_path_number_acl); tomoyo_put_name_union(&e.name); tomoyo_put_number_union(&e.number); return error; } /** * tomoyo_path_number_perm - Check permission for "create", "mkdir", "mkfifo", "mksock", "ioctl", "chmod", "chown", "chgrp". * * @type: Type of operation. * @path: Pointer to "struct path". * @number: Number. * * Returns 0 on success, negative value otherwise. */ int tomoyo_path_number_perm(const u8 type, const struct path *path, unsigned long number) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error = -ENOMEM; struct tomoyo_path_info buf; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pn2mac[type]) == TOMOYO_CONFIG_DISABLED) return 0; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf, path)) goto out; r.obj = &obj; if (type == TOMOYO_TYPE_MKDIR) tomoyo_add_slash(&buf); r.param_type = TOMOYO_TYPE_PATH_NUMBER_ACL; r.param.path_number.operation = type; r.param.path_number.filename = &buf; r.param.path_number.number = number; do { tomoyo_check_acl(&r, tomoyo_check_path_number_acl); error = tomoyo_audit_path_number_log(&r); } while (error == TOMOYO_RETRY_REQUEST); kfree(buf.name); out: tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_check_open_permission - Check permission for "read" and "write". * * @domain: Pointer to "struct tomoyo_domain_info". * @path: Pointer to "struct path". * @flag: Flags for open(). * * Returns 0 on success, negative value otherwise. */ int tomoyo_check_open_permission(struct tomoyo_domain_info *domain, const struct path *path, const int flag) { const u8 acc_mode = ACC_MODE(flag); int error = 0; struct tomoyo_path_info buf; struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int idx; buf.name = NULL; r.mode = TOMOYO_CONFIG_DISABLED; idx = tomoyo_read_lock(); if (acc_mode && tomoyo_init_request_info(&r, domain, TOMOYO_MAC_FILE_OPEN) != TOMOYO_CONFIG_DISABLED) { if (!tomoyo_get_realpath(&buf, path)) { error = -ENOMEM; goto out; } r.obj = &obj; if (acc_mode & MAY_READ) error = tomoyo_path_permission(&r, TOMOYO_TYPE_READ, &buf); if (!error && (acc_mode & MAY_WRITE)) error = tomoyo_path_permission(&r, (flag & O_APPEND) ? TOMOYO_TYPE_APPEND : TOMOYO_TYPE_WRITE, &buf); } out: kfree(buf.name); tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_path_perm - Check permission for "unlink", "rmdir", "truncate", "symlink", "append", "chroot" and "unmount". * * @operation: Type of operation. * @path: Pointer to "struct path". * @target: Symlink's target if @operation is TOMOYO_TYPE_SYMLINK, * NULL otherwise. * * Returns 0 on success, negative value otherwise. */ int tomoyo_path_perm(const u8 operation, const struct path *path, const char *target) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error; struct tomoyo_path_info buf; bool is_enforce; struct tomoyo_path_info symlink_target; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_p2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; is_enforce = (r.mode == TOMOYO_CONFIG_ENFORCING); error = -ENOMEM; buf.name = NULL; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf, path)) goto out; r.obj = &obj; switch (operation) { case TOMOYO_TYPE_RMDIR: case TOMOYO_TYPE_CHROOT: tomoyo_add_slash(&buf); break; case TOMOYO_TYPE_SYMLINK: symlink_target.name = tomoyo_encode(target); if (!symlink_target.name) goto out; tomoyo_fill_path_info(&symlink_target); obj.symlink_target = &symlink_target; break; } error = tomoyo_path_permission(&r, operation, &buf); if (operation == TOMOYO_TYPE_SYMLINK) kfree(symlink_target.name); out: kfree(buf.name); tomoyo_read_unlock(idx); if (!is_enforce) error = 0; return error; } /** * tomoyo_mkdev_perm - Check permission for "mkblock" and "mkchar". * * @operation: Type of operation. (TOMOYO_TYPE_MKCHAR or TOMOYO_TYPE_MKBLOCK) * @path: Pointer to "struct path". * @mode: Create mode. * @dev: Device number. * * Returns 0 on success, negative value otherwise. */ int tomoyo_mkdev_perm(const u8 operation, const struct path *path, const unsigned int mode, unsigned int dev) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error = -ENOMEM; struct tomoyo_path_info buf; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pnnn2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; idx = tomoyo_read_lock(); error = -ENOMEM; if (tomoyo_get_realpath(&buf, path)) { r.obj = &obj; dev = new_decode_dev(dev); r.param_type = TOMOYO_TYPE_MKDEV_ACL; r.param.mkdev.filename = &buf; r.param.mkdev.operation = operation; r.param.mkdev.mode = mode; r.param.mkdev.major = MAJOR(dev); r.param.mkdev.minor = MINOR(dev); tomoyo_check_acl(&r, tomoyo_check_mkdev_acl); error = tomoyo_audit_mkdev_log(&r); kfree(buf.name); } tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_path2_perm - Check permission for "rename", "link" and "pivot_root". * * @operation: Type of operation. * @path1: Pointer to "struct path". * @path2: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ int tomoyo_path2_perm(const u8 operation, const struct path *path1, const struct path *path2) { int error = -ENOMEM; struct tomoyo_path_info buf1; struct tomoyo_path_info buf2; struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path1->mnt, .dentry = path1->dentry }, .path2 = { .mnt = path2->mnt, .dentry = path2->dentry } }; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pp2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; buf1.name = NULL; buf2.name = NULL; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf1, path1) || !tomoyo_get_realpath(&buf2, path2)) goto out; switch (operation) { case TOMOYO_TYPE_RENAME: case TOMOYO_TYPE_LINK: if (!d_is_dir(path1->dentry)) break; fallthrough; case TOMOYO_TYPE_PIVOT_ROOT: tomoyo_add_slash(&buf1); tomoyo_add_slash(&buf2); break; } r.obj = &obj; r.param_type = TOMOYO_TYPE_PATH2_ACL; r.param.path2.operation = operation; r.param.path2.filename1 = &buf1; r.param.path2.filename2 = &buf2; do { tomoyo_check_acl(&r, tomoyo_check_path2_acl); error = tomoyo_audit_path2_log(&r); } while (error == TOMOYO_RETRY_REQUEST); out: kfree(buf1.name); kfree(buf2.name); tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_same_mount_acl - Check for duplicated "struct tomoyo_mount_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_mount_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_mount_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_mount_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->dev_name, &p2->dev_name) && tomoyo_same_name_union(&p1->dir_name, &p2->dir_name) && tomoyo_same_name_union(&p1->fs_type, &p2->fs_type) && tomoyo_same_number_union(&p1->flags, &p2->flags); } /** * tomoyo_update_mount_acl - Write "struct tomoyo_mount_acl" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_mount_acl(struct tomoyo_acl_param *param) { struct tomoyo_mount_acl e = { .head.type = TOMOYO_TYPE_MOUNT_ACL }; int error; if (!tomoyo_parse_name_union(param, &e.dev_name) || !tomoyo_parse_name_union(param, &e.dir_name) || !tomoyo_parse_name_union(param, &e.fs_type) || !tomoyo_parse_number_union(param, &e.flags)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_mount_acl, NULL); tomoyo_put_name_union(&e.dev_name); tomoyo_put_name_union(&e.dir_name); tomoyo_put_name_union(&e.fs_type); tomoyo_put_number_union(&e.flags); return error; } /** * tomoyo_write_file - Update file related list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_write_file(struct tomoyo_acl_param *param) { u16 perm = 0; u8 type; const char *operation = tomoyo_read_token(param); for (type = 0; type < TOMOYO_MAX_PATH_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_path_keyword[type])) perm |= 1 << type; if (perm) return tomoyo_update_path_acl(perm, param); for (type = 0; type < TOMOYO_MAX_PATH2_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_mac_keywords[tomoyo_pp2mac[type]])) perm |= 1 << type; if (perm) return tomoyo_update_path2_acl(perm, param); for (type = 0; type < TOMOYO_MAX_PATH_NUMBER_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_mac_keywords[tomoyo_pn2mac[type]])) perm |= 1 << type; if (perm) return tomoyo_update_path_number_acl(perm, param); for (type = 0; type < TOMOYO_MAX_MKDEV_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_mac_keywords[tomoyo_pnnn2mac[type]])) perm |= 1 << type; if (perm) return tomoyo_update_mkdev_acl(perm, param); if (tomoyo_permstr(operation, tomoyo_mac_keywords[TOMOYO_MAC_FILE_MOUNT])) return tomoyo_update_mount_acl(param); return -EINVAL; } |
5909 1244 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 | /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/userfaultfd_k.h * * Copyright (C) 2015 Red Hat, Inc. * */ #ifndef _LINUX_USERFAULTFD_K_H #define _LINUX_USERFAULTFD_K_H #ifdef CONFIG_USERFAULTFD #include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */ #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/swapops.h> #include <asm-generic/pgtable_uffd.h> #include <linux/hugetlb_inline.h> /* The set of all possible UFFD-related VM flags. */ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining * new flags, since they might collide with O_* ones. We want * to re-use O_* flags that couldn't possibly have a meaning * from userfaultfd, in order to leave a free define-space for * shared O_* flags. */ #define UFFD_CLOEXEC O_CLOEXEC #define UFFD_NONBLOCK O_NONBLOCK #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); /* A combined operation mode + behavior flags. */ typedef unsigned int __bitwise uffd_flags_t; /* Mutually exclusive modes of operation. */ enum mfill_atomic_mode { MFILL_ATOMIC_COPY, MFILL_ATOMIC_ZEROPAGE, MFILL_ATOMIC_CONTINUE, MFILL_ATOMIC_POISON, NR_MFILL_ATOMIC_MODES, }; #define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1) #define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr)) #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr)) #define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1)) static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected) { return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected); } static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode) { flags &= ~MFILL_ATOMIC_MODE_MASK; return flags | ((__force uffd_flags_t) mode); } /* Flags controlling behavior. These behavior changes are mode-independent. */ #define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0) extern int mfill_atomic_install_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, struct page *page, bool newly_allocated, uffd_flags_t flags); extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, atomic_t *mmap_changing, uffd_flags_t flags); extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, atomic_t *mmap_changing); extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, atomic_t *mmap_changing, uffd_flags_t flags); extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start, unsigned long len, atomic_t *mmap_changing, uffd_flags_t flags); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) { return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx; } /* * Never enable huge pmd sharing on some uffd registered vmas: * * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry. * * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for * VMAs which share huge pmds. (If you have two mappings to the same * underlying pages, and fault in the non-UFFD-registered one with a write, * with huge pmd sharing this would *also* setup the second UFFD-registered * mapping, and we'd not get minor faults.) */ static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) { return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } /* * Don't do fault around for either WP or MINOR registered uffd range. For * MINOR registered range, fault around will be a total disaster and ptes can * be installed without notifications; for WP it should mostly be fine as long * as the fault around checks for pte_none() before the installation, however * to be super safe we just forbid it. */ static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) { return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MISSING; } static inline bool userfaultfd_wp(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_WP; } static inline bool userfaultfd_minor(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MINOR; } static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { return userfaultfd_wp(vma) && pte_uffd_wp(pte); } static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, pmd_t pmd) { return userfaultfd_wp(vma) && pmd_uffd_wp(pmd); } static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return vma->vm_flags & __VM_UFFD_FLAGS; } static inline bool vma_can_userfault(struct vm_area_struct *vma, unsigned long vm_flags) { if ((vm_flags & VM_UFFD_MINOR) && (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) return false; #ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for * uffd-wp, then shmem & hugetlbfs are not supported but only * anonymous. */ if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) return false; #endif return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || vma_is_shmem(vma); } extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); extern void mremap_userfaultfd_prep(struct vm_area_struct *, struct vm_userfaultfd_ctx *); extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); #else /* CONFIG_USERFAULTFD */ /* mm helpers */ static inline vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) { return VM_FAULT_SIGBUS; } static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) { return true; } static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_wp(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_minor(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { return false; } static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, pmd_t pmd) { return false; } static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return false; } static inline int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *l) { return 0; } static inline void dup_userfaultfd_complete(struct list_head *l) { } static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *ctx) { } static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, unsigned long from, unsigned long to, unsigned long len) { } static inline bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { return true; } static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf) { return 0; } static inline void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) { } static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) { return false; } #endif /* CONFIG_USERFAULTFD */ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) { /* Only wr-protect mode uses pte markers */ if (!userfaultfd_wp(vma)) return false; /* File-based uffd-wp always need markers */ if (!vma_is_anonymous(vma)) return true; /* * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED * enabled (to apply markers on zero pages). */ return userfaultfd_wp_unpopulated(vma); } static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) { #ifdef CONFIG_PTE_MARKER_UFFD_WP return is_pte_marker_entry(entry) && (pte_marker_get(entry) & PTE_MARKER_UFFD_WP); #else return false; #endif } static inline bool pte_marker_uffd_wp(pte_t pte) { #ifdef CONFIG_PTE_MARKER_UFFD_WP swp_entry_t entry; if (!is_swap_pte(pte)) return false; entry = pte_to_swp_entry(pte); return pte_marker_entry_uffd_wp(entry); #else return false; #endif } /* * Returns true if this is a swap pte and was uffd-wp wr-protected in either * forms (pte marker or a normal swap pte), false otherwise. */ static inline bool pte_swp_uffd_wp_any(pte_t pte) { #ifdef CONFIG_PTE_MARKER_UFFD_WP if (!is_swap_pte(pte)) return false; if (pte_swp_uffd_wp(pte)) return true; if (pte_marker_uffd_wp(pte)) return true; #endif return false; } #endif /* _LINUX_USERFAULTFD_K_H */ |
2 2 2 2 7 7 1327 1327 1327 4 6 10 10 10 10 1 1 10 10 10 10 10 10 10 1 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 364 197 1 4 126 126 126 126 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Support for INET connection oriented protocols. * * Authors: See the TCP sources */ #include <linux/module.h> #include <linux/jhash.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> #include <net/sock_reuseport.h> #include <net/addrconf.h> #if IS_ENABLED(CONFIG_IPV6) /* match_sk*_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses * if IPv6 only, and any IPv4 addresses * if not IPv6 only * match_sk*_wildcard == false: addresses must be exactly the same, i.e. * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, * and 0.0.0.0 equals to 0.0.0.0 only */ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, const struct in6_addr *sk2_rcv_saddr6, __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, bool sk1_ipv6only, bool sk2_ipv6only, bool match_sk1_wildcard, bool match_sk2_wildcard) { int addr_type = ipv6_addr_type(sk1_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; /* if both are mapped, treat as IPv4 */ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) return true; return (match_sk1_wildcard && !sk1_rcv_saddr) || (match_sk2_wildcard && !sk2_rcv_saddr); } return false; } if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) return true; if (addr_type2 == IPV6_ADDR_ANY && match_sk2_wildcard && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) return true; if (addr_type == IPV6_ADDR_ANY && match_sk1_wildcard && !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) return true; if (sk2_rcv_saddr6 && ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) return true; return false; } #endif /* match_sk*_wildcard == true: 0.0.0.0 equals to any IPv4 addresses * match_sk*_wildcard == false: addresses must be exactly the same, i.e. * 0.0.0.0 only equals to 0.0.0.0 */ static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, bool sk2_ipv6only, bool match_sk1_wildcard, bool match_sk2_wildcard) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) return true; return (match_sk1_wildcard && !sk1_rcv_saddr) || (match_sk2_wildcard && !sk2_rcv_saddr); } return false; } bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr, inet6_rcv_saddr(sk2), sk->sk_rcv_saddr, sk2->sk_rcv_saddr, ipv6_only_sock(sk), ipv6_only_sock(sk2), match_wildcard, match_wildcard); #endif return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr, ipv6_only_sock(sk2), match_wildcard, match_wildcard); } EXPORT_SYMBOL(inet_rcv_saddr_equal); bool inet_rcv_saddr_any(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return ipv6_addr_any(&sk->sk_v6_rcv_saddr); #endif return !sk->sk_rcv_saddr; } void inet_get_local_port_range(const struct net *net, int *low, int *high) { unsigned int seq; do { seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); *low = net->ipv4.ip_local_ports.range[0]; *high = net->ipv4.ip_local_ports.range[1]; } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); } EXPORT_SYMBOL(inet_get_local_port_range); void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) { const struct inet_sock *inet = inet_sk(sk); const struct net *net = sock_net(sk); int lo, hi, sk_lo, sk_hi; inet_get_local_port_range(net, &lo, &hi); sk_lo = inet->local_port_range.lo; sk_hi = inet->local_port_range.hi; if (unlikely(lo <= sk_lo && sk_lo <= hi)) lo = sk_lo; if (unlikely(lo <= sk_hi && sk_hi <= hi)) hi = sk_hi; *low = lo; *high = hi; } EXPORT_SYMBOL(inet_sk_get_local_port_range); static bool inet_use_bhash2_on_bind(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); return addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED; } #endif return sk->sk_rcv_saddr != htonl(INADDR_ANY); } static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { int bound_dev_if2; if (sk == sk2) return false; bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); if (!sk->sk_bound_dev_if || !bound_dev_if2 || sk->sk_bound_dev_if == bound_dev_if2) { if (sk->sk_reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) { if (!relax || (!reuseport_ok && sk->sk_reuseport && sk2->sk_reuseport && reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT || uid_eq(sk_uid, sock_i_uid(sk2))))) return true; } else if (!reuseport_ok || !sk->sk_reuseport || !sk2->sk_reuseport || !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(sk_uid, sock_i_uid(sk2)))) { return true; } } return false; } static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) return false; return inet_bind_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok); } static bool inet_bhash2_conflict(const struct sock *sk, const struct inet_bind2_bucket *tb2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { struct inet_timewait_sock *tw2; struct sock *sk2; sk_for_each_bound_bhash2(sk2, &tb2->owners) { if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok)) return true; } twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) { sk2 = (struct sock *)tw2; if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok)) return true; } return false; } /* This should be called only when the tb and tb2 hashbuckets' locks are held */ static int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, const struct inet_bind2_bucket *tb2, /* may be null */ bool relax, bool reuseport_ok) { bool reuseport_cb_ok; struct sock_reuseport *reuseport_cb; kuid_t uid = sock_i_uid((struct sock *)sk); rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); rcu_read_unlock(); /* * Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed * in tb->owners and tb2->owners list belong * to the same net - the one this bucket belongs to. */ if (!inet_use_bhash2_on_bind(sk)) { struct sock *sk2; sk_for_each_bound(sk2, &tb->owners) if (inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok) && inet_rcv_saddr_equal(sk, sk2, true)) return true; return false; } /* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if * ipv4) should have been checked already. We need to do these two * checks separately because their spinlocks have to be acquired/released * independently of each other, to prevent possible deadlocks */ return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok); } /* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or * INADDR_ANY (if ipv4) socket. * * Caller must hold bhash hashbucket lock with local bh disabled, to protect * against concurrent binds on the port for addr any */ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev, bool relax, bool reuseport_ok) { kuid_t uid = sock_i_uid((struct sock *)sk); const struct net *net = sock_net(sk); struct sock_reuseport *reuseport_cb; struct inet_bind_hashbucket *head2; struct inet_bind2_bucket *tb2; bool reuseport_cb_ok; rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); rcu_read_unlock(); head2 = inet_bhash2_addr_any_hashbucket(sk, net, port); spin_lock(&head2->lock); inet_bind_bucket_for_each(tb2, &head2->chain) if (inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) break; if (tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok)) { spin_unlock(&head2->lock); return true; } spin_unlock(&head2->lock); return false; } /* * Find an open port number for the socket. Returns with the * inet_bind_hashbucket locks held if successful. */ static struct inet_bind_hashbucket * inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, struct inet_bind2_bucket **tb2_ret, struct inet_bind_hashbucket **head2_ret, int *port_ret) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); int i, low, high, attempt_half, port, l3mdev; struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; u32 remaining, offset; bool relax = false; l3mdev = inet_sk_bound_l3mdev(sk); ports_exhausted: attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: inet_sk_get_local_port_range(sk, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ if (high - low < 4) attempt_half = 0; if (attempt_half) { int half = low + (((high - low) >> 2) << 1); if (attempt_half == 1) high = half; else low = half; } remaining = high - low; if (likely(remaining > 1)) remaining &= ~1U; offset = get_random_u32_below(remaining); /* __inet_hash_connect() favors ports having @low parity * We do the opposite to not pollute connect() users. */ offset |= 1U; other_parity_scan: port = low + offset; for (i = 0; i < remaining; i += 2, port += 2) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); if (inet_use_bhash2_on_bind(sk)) { if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, relax, false)) goto next_port; } head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); inet_bind_bucket_for_each(tb, &head->chain) if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (!inet_csk_bind_conflict(sk, tb, tb2, relax, false)) goto success; spin_unlock(&head2->lock); goto next_port; } tb = NULL; goto success; next_port: spin_unlock_bh(&head->lock); cond_resched(); } offset--; if (!(offset & 1)) goto other_parity_scan; if (attempt_half == 1) { /* OK we now try the upper half of the range */ attempt_half = 2; goto other_half_scan; } if (READ_ONCE(net->ipv4.sysctl_ip_autobind_reuse) && !relax) { /* We still have a chance to connect to different destinations */ relax = true; goto ports_exhausted; } return NULL; success: *port_ret = port; *tb_ret = tb; *tb2_ret = tb2; *head2_ret = head2; return head; } static inline int sk_reuseport_match(struct inet_bind_bucket *tb, struct sock *sk) { kuid_t uid = sock_i_uid(sk); if (tb->fastreuseport <= 0) return 0; if (!sk->sk_reuseport) return 0; if (rcu_access_pointer(sk->sk_reuseport_cb)) return 0; if (!uid_eq(tb->fastuid, uid)) return 0; /* We only need to check the rcv_saddr if this tb was once marked * without fastreuseport and then was reset, as we can only know that * the fast_*rcv_saddr doesn't have any conflicts with the socks on the * owners list. */ if (tb->fastreuseport == FASTREUSEPORT_ANY) return 1; #if IS_ENABLED(CONFIG_IPV6) if (tb->fast_sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, inet6_rcv_saddr(sk), tb->fast_rcv_saddr, sk->sk_rcv_saddr, tb->fast_ipv6_only, ipv6_only_sock(sk), true, false); #endif return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr, ipv6_only_sock(sk), true, false); } void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, struct sock *sk) { kuid_t uid = sock_i_uid(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; if (hlist_empty(&tb->owners)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif } else { tb->fastreuseport = 0; } } else { if (!reuse) tb->fastreuse = 0; if (sk->sk_reuseport) { /* We didn't match or we don't have fastreuseport set on * the tb, but we have sk_reuseport set on this socket * and we know that there are no bind conflicts with * this socket in this tb, so reset our tb's reuseport * settings so that any subsequent sockets that match * our current socket will be put on the fast path. * * If we reset we need to set FASTREUSEPORT_STRICT so we * do extra checking for all subsequent sk_reuseport * socks. */ if (!sk_reuseport_match(tb, sk)) { tb->fastreuseport = FASTREUSEPORT_STRICT; tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif } } else { tb->fastreuseport = 0; } } } /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. * We try to allocate an odd port (and leave even ports for connect()) */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; bool found_port = false, check_bind_conflict = true; bool bhash_created = false, bhash2_created = false; int ret = -EADDRINUSE, port = snum, l3mdev; struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2 = NULL; struct inet_bind_bucket *tb = NULL; bool head2_lock_acquired = false; struct net *net = sock_net(sk); l3mdev = inet_sk_bound_l3mdev(sk); if (!port) { head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port); if (!head) return ret; head2_lock_acquired = true; if (tb && tb2) goto success; found_port = true; } else { head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) if (inet_bind_bucket_match(tb, net, port, l3mdev)) break; } if (!tb) { tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) goto fail_unlock; bhash_created = true; } if (!found_port) { if (!hlist_empty(&tb->owners)) { if (sk->sk_reuse == SK_FORCE_REUSE || (tb->fastreuse > 0 && reuse) || sk_reuseport_match(tb, sk)) check_bind_conflict = false; } if (check_bind_conflict && inet_use_bhash2_on_bind(sk)) { if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, true, true)) goto fail_unlock; } head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); head2_lock_acquired = true; tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); } if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, head2, port, l3mdev, sk); if (!tb2) goto fail_unlock; bhash2_created = true; } if (!found_port && check_bind_conflict) { if (inet_csk_bind_conflict(sk, tb, tb2, true, true)) goto fail_unlock; } success: inet_csk_update_fastreuse(tb, sk); if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, tb2, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2); ret = 0; fail_unlock: if (ret) { if (bhash_created) inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); if (bhash2_created) inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2); } if (head2_lock_acquired) spin_unlock(&head2->lock); spin_unlock_bh(&head->lock); return ret; } EXPORT_SYMBOL_GPL(inet_csk_get_port); /* * Wait for an incoming connection, avoid race conditions. This must be called * with the socket locked. */ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) { struct inet_connection_sock *icsk = inet_csk(sk); DEFINE_WAIT(wait); int err; /* * True wake-one mechanism for incoming connections: only * one process gets woken up, not the 'whole herd'. * Since we do not 'race & poll' for established sockets * anymore, the common case will execute the loop only once. * * Subtle issue: "add_wait_queue_exclusive()" will be added * after any current non-exclusive waiters, and we know that * it will always _stay_ after any new non-exclusive waiters * because all non-exclusive waiters are added at the * beginning of the wait-queue. As such, it's ok to "drop" * our exclusiveness temporarily when we get woken up without * having to remove and re-insert us on the wait queue. */ for (;;) { prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) timeo = schedule_timeout(timeo); sched_annotate_sleep(); lock_sock(sk); err = 0; if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) break; err = -EINVAL; if (sk->sk_state != TCP_LISTEN) break; err = sock_intr_errno(timeo); if (signal_pending(current)) break; err = -EAGAIN; if (!timeo) break; } finish_wait(sk_sleep(sk), &wait); return err; } /* * This will accept the next outstanding connection. */ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *req; struct sock *newsk; int error; lock_sock(sk); /* We need to make sure that this socket is listening, * and that it has something pending. */ error = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out_err; /* Find already established connection */ if (reqsk_queue_empty(queue)) { long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); /* If this is a non blocking socket don't sleep */ error = -EAGAIN; if (!timeo) goto out_err; error = inet_csk_wait_for_connect(sk, timeo); if (error) goto out_err; } req = reqsk_queue_remove(queue, sk); newsk = req->sk; if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { spin_lock_bh(&queue->fastopenq.lock); if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to * NULL to signify that the child socket is taken * so reqsk_fastopen_remove() will free the req * when 3WHS finishes (or is aborted). */ req->sk = NULL; req = NULL; } spin_unlock_bh(&queue->fastopenq.lock); } out: release_sock(sk); if (newsk && mem_cgroup_sockets_enabled) { int amt = 0; /* atomically get the memory usage, set and charge the * newsk->sk_memcg. */ lock_sock(newsk); mem_cgroup_sk_alloc(newsk); if (newsk->sk_memcg) { /* The socket has not been accepted yet, no need * to look at newsk->sk_wmem_queued. */ amt = sk_mem_pages(newsk->sk_forward_alloc + atomic_read(&newsk->sk_rmem_alloc)); } if (amt) mem_cgroup_charge_skmem(newsk->sk_memcg, amt, GFP_KERNEL | __GFP_NOFAIL); release_sock(newsk); } if (req) reqsk_put(req); return newsk; out_err: newsk = NULL; req = NULL; *err = error; goto out; } EXPORT_SYMBOL(inet_csk_accept); /* * Using different timers for retransmit, delayed acks and probes * We may wish use just one timer maintaining a list of expire jiffies * to optimize. */ void inet_csk_init_xmit_timers(struct sock *sk, void (*retransmit_handler)(struct timer_list *t), void (*delack_handler)(struct timer_list *t), void (*keepalive_handler)(struct timer_list *t)) { struct inet_connection_sock *icsk = inet_csk(sk); timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } EXPORT_SYMBOL(inet_csk_init_xmit_timers); void inet_csk_clear_xmit_timers(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_pending = icsk->icsk_ack.pending = 0; sk_stop_timer(sk, &icsk->icsk_retransmit_timer); sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } EXPORT_SYMBOL(inet_csk_clear_xmit_timers); void inet_csk_delete_keepalive_timer(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); } EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) { sk_reset_timer(sk, &sk->sk_timer, jiffies + len); } EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) { const struct inet_request_sock *ireq = inet_rsk(req); struct net *net = read_pnet(&ireq->ireq_net); struct ip_options_rcu *opt; struct rtable *rt; rcu_read_lock(); opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_req); struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, const struct request_sock *req) { const struct inet_request_sock *ireq = inet_rsk(req); struct net *net = read_pnet(&ireq->ireq_net); struct inet_sock *newinet = inet_sk(newsk); struct ip_options_rcu *opt; struct flowi4 *fl4; struct rtable *rt; opt = rcu_dereference(ireq->ireq_opt); fl4 = &newinet->cork.fl.u.ip4; flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; route_err: ip_rt_put(rt); no_route: __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); /* Decide when to expire the request and when to resend SYN-ACK */ static void syn_ack_recalc(struct request_sock *req, const int max_syn_ack_retries, const u8 rskq_defer_accept, int *expire, int *resend) { if (!rskq_defer_accept) { *expire = req->num_timeout >= max_syn_ack_retries; *resend = 1; return; } *expire = req->num_timeout >= max_syn_ack_retries && (!inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept); /* Do not resend while waiting for data after ACK, * start to resend on end of deferring period to give * last chance for data or ACK to create established socket. */ *resend = !inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept - 1; } int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) { int err = req->rsk_ops->rtx_syn_ack(parent, req); if (!err) req->num_retrans++; return err; } EXPORT_SYMBOL(inet_rtx_syn_ack); static struct request_sock *inet_reqsk_clone(struct request_sock *req, struct sock *sk) { struct sock *req_sk, *nreq_sk; struct request_sock *nreq; nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN); if (!nreq) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */ sock_put(sk); return NULL; } req_sk = req_to_sk(req); nreq_sk = req_to_sk(nreq); memcpy(nreq_sk, req_sk, offsetof(struct sock, sk_dontcopy_begin)); memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end, req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end)); sk_node_init(&nreq_sk->sk_node); nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping; #endif nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu; nreq->rsk_listener = sk; /* We need not acquire fastopenq->lock * because the child socket is locked in inet_csk_listen_stop(). */ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener) rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq); return nreq; } static void reqsk_queue_migrated(struct request_sock_queue *queue, const struct request_sock *req) { if (req->num_timeout == 0) atomic_inc(&queue->young); atomic_inc(&queue->qlen); } static void reqsk_migrate_reset(struct request_sock *req) { req->saved_syn = NULL; #if IS_ENABLED(CONFIG_IPV6) inet_rsk(req)->ipv6_opt = NULL; inet_rsk(req)->pktopts = NULL; #else inet_rsk(req)->ireq_opt = NULL; #endif } /* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock *req) { struct sock *sk = req_to_sk(req); bool found = false; if (sk_hashed(sk)) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); spin_lock(lock); found = __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); } if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { bool unlinked = reqsk_queue_unlink(req); if (unlinked) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); reqsk_put(req); } return unlinked; } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_put(req); } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { struct request_sock *req = from_timer(req, t, rsk_timer); struct request_sock *nreq = NULL, *oreq = req; struct sock *sk_listener = req->rsk_listener; struct inet_connection_sock *icsk; struct request_sock_queue *queue; struct net *net; int max_syn_ack_retries, qlen, expire = 0, resend = 0; if (inet_sk_state_load(sk_listener) != TCP_LISTEN) { struct sock *nsk; nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL); if (!nsk) goto drop; nreq = inet_reqsk_clone(req, nsk); if (!nreq) goto drop; /* The new timer for the cloned req can decrease the 2 * by calling inet_csk_reqsk_queue_drop_and_put(), so * hold another count to prevent use-after-free and * call reqsk_put() just before return. */ refcount_set(&nreq->rsk_refcnt, 2 + 1); timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED); reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req); req = nreq; sk_listener = nsk; } icsk = inet_csk(sk_listener); net = sock_net(sk_listener); max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_synack_retries); /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. * If synack was not acknowledged for 1 second, it means * one of the following things: synack was lost, ack was lost, * rtt is high or nobody planned to ack (i.e. synflood). * When server is a bit loaded, queue is populated with old * open requests, reducing effective size of queue. * When server is well loaded, queue size reduces to zero * after several minutes of work. It is not synflood, * it is normal operation. The solution is pruning * too old entries overriding normal timeout, when * situation becomes dangerous. * * Essentially, we reserve half of room for young * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ queue = &icsk->icsk_accept_queue; qlen = reqsk_queue_len(queue); if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; while (max_syn_ack_retries > 2) { if (qlen < young) break; max_syn_ack_retries--; young <<= 1; } } syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept), &expire, &resend); req->rsk_ops->syn_ack_timeout(req); if (!expire && (!resend || !inet_rtx_syn_ack(sk_listener, req) || inet_rsk(req)->acked)) { if (req->num_timeout++ == 0) atomic_dec(&queue->young); mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX)); if (!nreq) return; if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { /* delete timer */ inet_csk_reqsk_queue_drop(sk_listener, nreq); goto no_ownership; } __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(oreq); reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq); reqsk_put(oreq); reqsk_put(nreq); return; } /* Even if we can clone the req, we may need not retransmit any more * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another * CPU may win the "own_req" race so that inet_ehash_insert() fails. */ if (nreq) { __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE); no_ownership: reqsk_migrate_reset(nreq); reqsk_queue_removed(queue, nreq); __reqsk_free(nreq); } drop: inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } static void reqsk_queue_hash_req(struct request_sock *req, unsigned long timeout) { timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); inet_ehash_insert(req_to_sk(req), NULL, NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); refcount_set(&req->rsk_refcnt, 2 + 1); } void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout) { reqsk_queue_hash_req(req, timeout); inet_csk_reqsk_queue_added(sk); } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, const gfp_t priority) { struct inet_connection_sock *icsk = inet_csk(newsk); if (!icsk->icsk_ulp_ops) return; icsk->icsk_ulp_ops->clone(req, newsk, priority); } /** * inet_csk_clone_lock - clone an inet socket, and lock its clone * @sk: the socket to clone * @req: request_sock * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) */ struct sock *inet_csk_clone_lock(const struct sock *sk, const struct request_sock *req, const gfp_t priority) { struct sock *newsk = sk_clone_lock(sk, priority); if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; newicsk->icsk_bind2_hash = NULL; inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); /* listeners have SOCK_RCU_FREE, not the children */ sock_reset_flag(newsk, SOCK_RCU_FREE); inet_sk(newsk)->mc_list = NULL; newsk->sk_mark = inet_rsk(req)->ir_mark; atomic64_set(&newsk->sk_cookie, atomic64_read(&inet_rsk(req)->ir_cookie)); newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; newicsk->icsk_probes_tstamp = 0; /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); inet_clone_ulp(req, newsk, priority); security_inet_csk_clone(newsk, req); } return newsk; } EXPORT_SYMBOL_GPL(inet_csk_clone_lock); /* * At this point, there should be no process reference to this * socket, and thus no user references at all. Therefore we * can assume the socket waitqueue is inactive and nobody will * try to jump onto it. */ void inet_csk_destroy_sock(struct sock *sk) { WARN_ON(sk->sk_state != TCP_CLOSE); WARN_ON(!sock_flag(sk, SOCK_DEAD)); /* It cannot be in hash table! */ WARN_ON(!sk_unhashed(sk)); /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */ WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash); sk->sk_prot->destroy(sk); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); this_cpu_dec(*sk->sk_prot->orphan_count); sock_put(sk); } EXPORT_SYMBOL(inet_csk_destroy_sock); /* This function allows to force a closure of a socket after the call to * tcp/dccp_create_openreq_child(). */ void inet_csk_prepare_forced_close(struct sock *sk) __releases(&sk->sk_lock.slock) { /* sk_clone_lock locked the socket and set refcnt to 2 */ bh_unlock_sock(sk); sock_put(sk); inet_csk_prepare_for_destroy_sock(sk); inet_sk(sk)->inet_num = 0; } EXPORT_SYMBOL(inet_csk_prepare_forced_close); static int inet_ulp_can_listen(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ulp_ops && !icsk->icsk_ulp_ops->clone) return -EINVAL; return 0; } int inet_csk_listen_start(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); int err; err = inet_ulp_can_listen(sk); if (unlikely(err)) return err; reqsk_queue_alloc(&icsk->icsk_accept_queue); sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). * It is OK, because this socket enters to hash table only * after validation is complete. */ inet_sk_state_store(sk, TCP_LISTEN); err = sk->sk_prot->get_port(sk, inet->inet_num); if (!err) { inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); err = sk->sk_prot->hash(sk); if (likely(!err)) return 0; } inet_sk_set_state(sk, TCP_CLOSE); return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); static void inet_child_forget(struct sock *sk, struct request_sock *req, struct sock *child) { sk->sk_prot->disconnect(child, O_NONBLOCK); sock_orphan(child); this_cpu_inc(*sk->sk_prot->orphan_count); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); BUG_ON(sk != req->rsk_listener); /* Paranoid, to prevent race condition if * an inbound pkt destined for child is * blocked by sock lock in tcp_v4_rcv(). * Also to satisfy an assertion in * tcp_v4_destroy_sock(). */ RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL); } inet_csk_destroy_sock(child); } struct sock *inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, struct sock *child) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; spin_lock(&queue->rskq_lock); if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_child_forget(sk, req, child); child = NULL; } else { req->sk = child; req->dl_next = NULL; if (queue->rskq_accept_head == NULL) WRITE_ONCE(queue->rskq_accept_head, req); else queue->rskq_accept_tail->dl_next = req; queue->rskq_accept_tail = req; sk_acceptq_added(sk); } spin_unlock(&queue->rskq_lock); return child; } EXPORT_SYMBOL(inet_csk_reqsk_queue_add); struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req) { if (own_req) { inet_csk_reqsk_queue_drop(req->rsk_listener, req); reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); if (sk != req->rsk_listener) { /* another listening sk has been selected, * migrate the req to it. */ struct request_sock *nreq; /* hold a refcnt for the nreq->rsk_listener * which is assigned in inet_reqsk_clone() */ sock_hold(sk); nreq = inet_reqsk_clone(req, sk); if (!nreq) { inet_child_forget(sk, req, child); goto child_put; } refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(sk, nreq, child)) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); reqsk_put(req); return child; } __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } else if (inet_csk_reqsk_queue_add(sk, req, child)) { return child; } } /* Too bad, another child took ownership of the request, undo. */ child_put: bh_unlock_sock(child); sock_put(child); return NULL; } EXPORT_SYMBOL(inet_csk_complete_hashdance); /* * This routine closes sockets which have been at least partially * opened, but not yet accepted. */ void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *next, *req; /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) * or to send active reset (abort). * Certainly, it is pretty dangerous while synflood, but it is * bad justification for our negligence 8) * To be honest, we are not able to make either * of the variants now. --ANK */ while ((req = reqsk_queue_remove(queue, sk)) != NULL) { struct sock *child = req->sk, *nsk; struct request_sock *nreq; local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); nsk = reuseport_migrate_sock(sk, child, NULL); if (nsk) { nreq = inet_reqsk_clone(req, nsk); if (nreq) { refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { __NET_INC_STATS(sock_net(nsk), LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); } else { __NET_INC_STATS(sock_net(nsk), LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } /* inet_csk_reqsk_queue_add() has already * called inet_child_forget() on failure case. */ goto skip_child_forget; } } inet_child_forget(sk, req, child); skip_child_forget: reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); sock_put(child); cond_resched(); } if (queue->fastopenq.rskq_rst_head) { /* Free all the reqs queued in rskq_rst_head. */ spin_lock_bh(&queue->fastopenq.lock); req = queue->fastopenq.rskq_rst_head; queue->fastopenq.rskq_rst_head = NULL; spin_unlock_bh(&queue->fastopenq.lock); while (req != NULL) { next = req->dl_next; reqsk_put(req); req = next; } } WARN_ON_ONCE(sk->sk_ack_backlog); } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; const struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; sin->sin_addr.s_addr = inet->inet_daddr; sin->sin_port = inet->inet_dport; } EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; struct flowi4 *fl4; struct rtable *rt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; fl4 = &fl->u.ip4; rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); if (IS_ERR(rt)) rt = NULL; if (rt) sk_setup_caps(sk, &rt->dst); rcu_read_unlock(); return &rt->dst; } struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) { struct dst_entry *dst = __sk_dst_check(sk, 0); struct inet_sock *inet = inet_sk(sk); if (!dst) { dst = inet_csk_rebuild_route(sk, &inet->cork.fl); if (!dst) goto out; } dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = __sk_dst_check(sk, 0); if (!dst) dst = inet_csk_rebuild_route(sk, &inet->cork.fl); out: return dst; } EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); |
2245 9 9 9 14 7 7 7 14 1 1 1 1 1 2245 2245 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 | // SPDX-License-Identifier: GPL-2.0 /* * Encryption policy functions for per-file encryption support. * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility. * * Originally written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. * Modified by Eric Biggers, 2019 for v2 policy support. */ #include <linux/fs_context.h> #include <linux/random.h> #include <linux/seq_file.h> #include <linux/string.h> #include <linux/mount.h> #include "fscrypt_private.h" /** * fscrypt_policies_equal() - check whether two encryption policies are the same * @policy1: the first policy * @policy2: the second policy * * Return: %true if equal, else %false */ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2) { if (policy1->version != policy2->version) return false; return !memcmp(policy1, policy2, fscrypt_policy_size(policy1)); } int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, struct fscrypt_key_specifier *key_spec) { switch (policy->version) { case FSCRYPT_POLICY_V1: key_spec->type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; memcpy(key_spec->u.descriptor, policy->v1.master_key_descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE); return 0; case FSCRYPT_POLICY_V2: key_spec->type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; memcpy(key_spec->u.identifier, policy->v2.master_key_identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); return 0; default: WARN_ON_ONCE(1); return -EINVAL; } } const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb) { if (!sb->s_cop->get_dummy_policy) return NULL; return sb->s_cop->get_dummy_policy(sb); } /* * Return %true if the given combination of encryption modes is supported for v1 * (and later) encryption policies. * * Do *not* add anything new here, since v1 encryption policies are deprecated. * New combinations of modes should go in fscrypt_valid_enc_modes_v2() only. */ static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && filenames_mode == FSCRYPT_MODE_AES_256_CTS) return true; if (contents_mode == FSCRYPT_MODE_AES_128_CBC && filenames_mode == FSCRYPT_MODE_AES_128_CTS) return true; if (contents_mode == FSCRYPT_MODE_ADIANTUM && filenames_mode == FSCRYPT_MODE_ADIANTUM) return true; return false; } static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && filenames_mode == FSCRYPT_MODE_AES_256_HCTR2) return true; if (contents_mode == FSCRYPT_MODE_SM4_XTS && filenames_mode == FSCRYPT_MODE_SM4_CTS) return true; return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode); } static bool supported_direct_key_modes(const struct inode *inode, u32 contents_mode, u32 filenames_mode) { const struct fscrypt_mode *mode; if (contents_mode != filenames_mode) { fscrypt_warn(inode, "Direct key flag not allowed with different contents and filenames modes"); return false; } mode = &fscrypt_modes[contents_mode]; if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) { fscrypt_warn(inode, "Direct key flag not allowed with %s", mode->friendly_name); return false; } return true; } static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, const struct inode *inode, const char *type, int max_ino_bits, int max_lblk_bits) { struct super_block *sb = inode->i_sb; int ino_bits = 64, lblk_bits = 64; /* * IV_INO_LBLK_* exist only because of hardware limitations, and * currently the only known use case for them involves AES-256-XTS. * That's also all we test currently. For these reasons, for now only * allow AES-256-XTS here. This can be relaxed later if a use case for * IV_INO_LBLK_* with other encryption modes arises. */ if (policy->contents_encryption_mode != FSCRYPT_MODE_AES_256_XTS) { fscrypt_warn(inode, "Can't use %s policy with contents mode other than AES-256-XTS", type); return false; } /* * It's unsafe to include inode numbers in the IVs if the filesystem can * potentially renumber inodes, e.g. via filesystem shrinking. */ if (!sb->s_cop->has_stable_inodes || !sb->s_cop->has_stable_inodes(sb)) { fscrypt_warn(inode, "Can't use %s policy on filesystem '%s' because it doesn't have stable inode numbers", type, sb->s_id); return false; } if (sb->s_cop->get_ino_and_lblk_bits) sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits); if (ino_bits > max_ino_bits) { fscrypt_warn(inode, "Can't use %s policy on filesystem '%s' because its inode numbers are too long", type, sb->s_id); return false; } if (lblk_bits > max_lblk_bits) { fscrypt_warn(inode, "Can't use %s policy on filesystem '%s' because its block numbers are too long", type, sb->s_id); return false; } return true; } static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, const struct inode *inode) { if (!fscrypt_valid_enc_modes_v1(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, "Unsupported encryption modes (contents %d, filenames %d)", policy->contents_encryption_mode, policy->filenames_encryption_mode); return false; } if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK | FSCRYPT_POLICY_FLAG_DIRECT_KEY)) { fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)", policy->flags); return false; } if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && !supported_direct_key_modes(inode, policy->contents_encryption_mode, policy->filenames_encryption_mode)) return false; if (IS_CASEFOLDED(inode)) { /* With v1, there's no way to derive dirhash keys. */ fscrypt_warn(inode, "v1 policies can't be used on casefolded directories"); return false; } return true; } static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, const struct inode *inode) { int count = 0; if (!fscrypt_valid_enc_modes_v2(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, "Unsupported encryption modes (contents %d, filenames %d)", policy->contents_encryption_mode, policy->filenames_encryption_mode); return false; } if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK | FSCRYPT_POLICY_FLAG_DIRECT_KEY | FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 | FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) { fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)", policy->flags); return false; } count += !!(policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY); count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64); count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32); if (count > 1) { fscrypt_warn(inode, "Mutually exclusive encryption flags (0x%02x)", policy->flags); return false; } if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && !supported_direct_key_modes(inode, policy->contents_encryption_mode, policy->filenames_encryption_mode)) return false; if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64", 32, 32)) return false; /* * IV_INO_LBLK_32 hashes the inode number, so in principle it can * support any ino_bits. However, currently the inode number is gotten * from inode::i_ino which is 'unsigned long'. So for now the * implementation limit is 32 bits. */ if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) && !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32", 32, 32)) return false; if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { fscrypt_warn(inode, "Reserved bits set in encryption policy"); return false; } return true; } /** * fscrypt_supported_policy() - check whether an encryption policy is supported * @policy_u: the encryption policy * @inode: the inode on which the policy will be used * * Given an encryption policy, check whether all its encryption modes and other * settings are supported by this kernel on the given inode. (But we don't * currently don't check for crypto API support here, so attempting to use an * algorithm not configured into the crypto API will still fail later.) * * Return: %true if supported, else %false */ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode) { switch (policy_u->version) { case FSCRYPT_POLICY_V1: return fscrypt_supported_v1_policy(&policy_u->v1, inode); case FSCRYPT_POLICY_V2: return fscrypt_supported_v2_policy(&policy_u->v2, inode); } return false; } /** * fscrypt_new_context() - create a new fscrypt_context * @ctx_u: output context * @policy_u: input policy * @nonce: nonce to use * * Create an fscrypt_context for an inode that is being assigned the given * encryption policy. @nonce must be a new random nonce. * * Return: the size of the new context in bytes. */ static int fscrypt_new_context(union fscrypt_context *ctx_u, const union fscrypt_policy *policy_u, const u8 nonce[FSCRYPT_FILE_NONCE_SIZE]) { memset(ctx_u, 0, sizeof(*ctx_u)); switch (policy_u->version) { case FSCRYPT_POLICY_V1: { const struct fscrypt_policy_v1 *policy = &policy_u->v1; struct fscrypt_context_v1 *ctx = &ctx_u->v1; ctx->version = FSCRYPT_CONTEXT_V1; ctx->contents_encryption_mode = policy->contents_encryption_mode; ctx->filenames_encryption_mode = policy->filenames_encryption_mode; ctx->flags = policy->flags; memcpy(ctx->master_key_descriptor, policy->master_key_descriptor, sizeof(ctx->master_key_descriptor)); memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); return sizeof(*ctx); } case FSCRYPT_POLICY_V2: { const struct fscrypt_policy_v2 *policy = &policy_u->v2; struct fscrypt_context_v2 *ctx = &ctx_u->v2; ctx->version = FSCRYPT_CONTEXT_V2; ctx->contents_encryption_mode = policy->contents_encryption_mode; ctx->filenames_encryption_mode = policy->filenames_encryption_mode; ctx->flags = policy->flags; memcpy(ctx->master_key_identifier, policy->master_key_identifier, sizeof(ctx->master_key_identifier)); memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); return sizeof(*ctx); } } BUG(); } /** * fscrypt_policy_from_context() - convert an fscrypt_context to * an fscrypt_policy * @policy_u: output policy * @ctx_u: input context * @ctx_size: size of input context in bytes * * Given an fscrypt_context, build the corresponding fscrypt_policy. * * Return: 0 on success, or -EINVAL if the fscrypt_context has an unrecognized * version number or size. * * This does *not* validate the settings within the policy itself, e.g. the * modes, flags, and reserved bits. Use fscrypt_supported_policy() for that. */ int fscrypt_policy_from_context(union fscrypt_policy *policy_u, const union fscrypt_context *ctx_u, int ctx_size) { memset(policy_u, 0, sizeof(*policy_u)); if (!fscrypt_context_is_valid(ctx_u, ctx_size)) return -EINVAL; switch (ctx_u->version) { case FSCRYPT_CONTEXT_V1: { const struct fscrypt_context_v1 *ctx = &ctx_u->v1; struct fscrypt_policy_v1 *policy = &policy_u->v1; policy->version = FSCRYPT_POLICY_V1; policy->contents_encryption_mode = ctx->contents_encryption_mode; policy->filenames_encryption_mode = ctx->filenames_encryption_mode; policy->flags = ctx->flags; memcpy(policy->master_key_descriptor, ctx->master_key_descriptor, sizeof(policy->master_key_descriptor)); return 0; } case FSCRYPT_CONTEXT_V2: { const struct fscrypt_context_v2 *ctx = &ctx_u->v2; struct fscrypt_policy_v2 *policy = &policy_u->v2; policy->version = FSCRYPT_POLICY_V2; policy->contents_encryption_mode = ctx->contents_encryption_mode; policy->filenames_encryption_mode = ctx->filenames_encryption_mode; policy->flags = ctx->flags; memcpy(policy->__reserved, ctx->__reserved, sizeof(policy->__reserved)); memcpy(policy->master_key_identifier, ctx->master_key_identifier, sizeof(policy->master_key_identifier)); return 0; } } /* unreachable */ return -EINVAL; } /* Retrieve an inode's encryption policy */ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy) { const struct fscrypt_info *ci; union fscrypt_context ctx; int ret; ci = fscrypt_get_info(inode); if (ci) { /* key available, use the cached policy */ *policy = ci->ci_policy; return 0; } if (!IS_ENCRYPTED(inode)) return -ENODATA; ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (ret < 0) return (ret == -ERANGE) ? -EINVAL : ret; return fscrypt_policy_from_context(policy, &ctx, ret); } static int set_encryption_policy(struct inode *inode, const union fscrypt_policy *policy) { u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; union fscrypt_context ctx; int ctxsize; int err; if (!fscrypt_supported_policy(policy, inode)) return -EINVAL; switch (policy->version) { case FSCRYPT_POLICY_V1: /* * The original encryption policy version provided no way of * verifying that the correct master key was supplied, which was * insecure in scenarios where multiple users have access to the * same encrypted files (even just read-only access). The new * encryption policy version fixes this and also implies use of * an improved key derivation function and allows non-root users * to securely remove keys. So as long as compatibility with * old kernels isn't required, it is recommended to use the new * policy version for all new encrypted directories. */ pr_warn_once("%s (pid %d) is setting deprecated v1 encryption policy; recommend upgrading to v2.\n", current->comm, current->pid); break; case FSCRYPT_POLICY_V2: err = fscrypt_verify_key_added(inode->i_sb, policy->v2.master_key_identifier); if (err) return err; if (policy->v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) pr_warn_once("%s (pid %d) is setting an IV_INO_LBLK_32 encryption policy. This should only be used if there are certain hardware limitations.\n", current->comm, current->pid); break; default: WARN_ON_ONCE(1); return -EINVAL; } get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE); ctxsize = fscrypt_new_context(&ctx, policy, nonce); return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL); } int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { union fscrypt_policy policy; union fscrypt_policy existing_policy; struct inode *inode = file_inode(filp); u8 version; int size; int ret; if (get_user(policy.version, (const u8 __user *)arg)) return -EFAULT; size = fscrypt_policy_size(&policy); if (size <= 0) return -EINVAL; /* * We should just copy the remaining 'size - 1' bytes here, but a * bizarre bug in gcc 7 and earlier (fixed by gcc r255731) causes gcc to * think that size can be 0 here (despite the check above!) *and* that * it's a compile-time constant. Thus it would think copy_from_user() * is passed compile-time constant ULONG_MAX, causing the compile-time * buffer overflow check to fail, breaking the build. This only occurred * when building an i386 kernel with -Os and branch profiling enabled. * * Work around it by just copying the first byte again... */ version = policy.version; if (copy_from_user(&policy, arg, size)) return -EFAULT; policy.version = version; if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EACCES; ret = mnt_want_write_file(filp); if (ret) return ret; inode_lock(inode); ret = fscrypt_get_policy(inode, &existing_policy); if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) ret = -ENOTDIR; else if (IS_DEADDIR(inode)) ret = -ENOENT; else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else ret = set_encryption_policy(inode, &policy); } else if (ret == -EINVAL || (ret == 0 && !fscrypt_policies_equal(&policy, &existing_policy))) { /* The file already uses a different encryption policy. */ ret = -EEXIST; } inode_unlock(inode); mnt_drop_write_file(filp); return ret; } EXPORT_SYMBOL(fscrypt_ioctl_set_policy); /* Original ioctl version; can only get the original policy version */ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { union fscrypt_policy policy; int err; err = fscrypt_get_policy(file_inode(filp), &policy); if (err) return err; if (policy.version != FSCRYPT_POLICY_V1) return -EINVAL; if (copy_to_user(arg, &policy, sizeof(policy.v1))) return -EFAULT; return 0; } EXPORT_SYMBOL(fscrypt_ioctl_get_policy); /* Extended ioctl version; can get policies of any version */ int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *uarg) { struct fscrypt_get_policy_ex_arg arg; union fscrypt_policy *policy = (union fscrypt_policy *)&arg.policy; size_t policy_size; int err; /* arg is policy_size, then policy */ BUILD_BUG_ON(offsetof(typeof(arg), policy_size) != 0); BUILD_BUG_ON(offsetofend(typeof(arg), policy_size) != offsetof(typeof(arg), policy)); BUILD_BUG_ON(sizeof(arg.policy) != sizeof(*policy)); err = fscrypt_get_policy(file_inode(filp), policy); if (err) return err; policy_size = fscrypt_policy_size(policy); if (copy_from_user(&arg, uarg, sizeof(arg.policy_size))) return -EFAULT; if (policy_size > arg.policy_size) return -EOVERFLOW; arg.policy_size = policy_size; if (copy_to_user(uarg, &arg, sizeof(arg.policy_size) + policy_size)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_policy_ex); /* FS_IOC_GET_ENCRYPTION_NONCE: retrieve file's encryption nonce for testing */ int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg) { struct inode *inode = file_inode(filp); union fscrypt_context ctx; int ret; ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (ret < 0) return ret; if (!fscrypt_context_is_valid(&ctx, ret)) return -EINVAL; if (copy_to_user(arg, fscrypt_context_nonce(&ctx), FSCRYPT_FILE_NONCE_SIZE)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_nonce); /** * fscrypt_has_permitted_context() - is a file's encryption policy permitted * within its directory? * * @parent: inode for parent directory * @child: inode for file being looked up, opened, or linked into @parent * * Filesystems must call this before permitting access to an inode in a * situation where the parent directory is encrypted (either before allowing * ->lookup() to succeed, or for a regular file before allowing it to be opened) * and before any operation that involves linking an inode into an encrypted * directory, including link, rename, and cross rename. It enforces the * constraint that within a given encrypted directory tree, all files use the * same encryption policy. The pre-access check is needed to detect potentially * malicious offline violations of this constraint, while the link and rename * checks are needed to prevent online violations of this constraint. * * Return: 1 if permitted, 0 if forbidden. */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { union fscrypt_policy parent_policy, child_policy; int err, err1, err2; /* No restrictions on file types which are never encrypted */ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && !S_ISLNK(child->i_mode)) return 1; /* No restrictions if the parent directory is unencrypted */ if (!IS_ENCRYPTED(parent)) return 1; /* Encrypted directories must not contain unencrypted files */ if (!IS_ENCRYPTED(child)) return 0; /* * Both parent and child are encrypted, so verify they use the same * encryption policy. Compare the fscrypt_info structs if the keys are * available, otherwise retrieve and compare the fscrypt_contexts. * * Note that the fscrypt_context retrieval will be required frequently * when accessing an encrypted directory tree without the key. * Performance-wise this is not a big deal because we already don't * really optimize for file access without the key (to the extent that * such access is even possible), given that any attempted access * already causes a fscrypt_context retrieval and keyring search. * * In any case, if an unexpected error occurs, fall back to "forbidden". */ err = fscrypt_get_encryption_info(parent, true); if (err) return 0; err = fscrypt_get_encryption_info(child, true); if (err) return 0; err1 = fscrypt_get_policy(parent, &parent_policy); err2 = fscrypt_get_policy(child, &child_policy); /* * Allow the case where the parent and child both have an unrecognized * encryption policy, so that files with an unrecognized encryption * policy can be deleted. */ if (err1 == -EINVAL && err2 == -EINVAL) return 1; if (err1 || err2) return 0; return fscrypt_policies_equal(&parent_policy, &child_policy); } EXPORT_SYMBOL(fscrypt_has_permitted_context); /* * Return the encryption policy that new files in the directory will inherit, or * NULL if none, or an ERR_PTR() on error. If the directory is encrypted, also * ensure that its key is set up, so that the new filename can be encrypted. */ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) { int err; if (IS_ENCRYPTED(dir)) { err = fscrypt_require_key(dir); if (err) return ERR_PTR(err); return &dir->i_crypt_info->ci_policy; } return fscrypt_get_dummy_policy(dir->i_sb); } /** * fscrypt_context_for_new_inode() - create an encryption context for a new inode * @ctx: where context should be written * @inode: inode from which to fetch policy and nonce * * Given an in-core "prepared" (via fscrypt_prepare_new_inode) inode, * generate a new context and write it to ctx. ctx _must_ be at least * FSCRYPT_SET_CONTEXT_MAX_SIZE bytes. * * Return: size of the resulting context or a negative error code. */ int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) { struct fscrypt_info *ci = inode->i_crypt_info; BUILD_BUG_ON(sizeof(union fscrypt_context) != FSCRYPT_SET_CONTEXT_MAX_SIZE); /* fscrypt_prepare_new_inode() should have set up the key already. */ if (WARN_ON_ONCE(!ci)) return -ENOKEY; return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce); } EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); /** * fscrypt_set_context() - Set the fscrypt context of a new inode * @inode: a new inode * @fs_data: private data given by FS and passed to ->set_context() * * This should be called after fscrypt_prepare_new_inode(), generally during a * filesystem transaction. Everything here must be %GFP_NOFS-safe. * * Return: 0 on success, -errno on failure */ int fscrypt_set_context(struct inode *inode, void *fs_data) { struct fscrypt_info *ci = inode->i_crypt_info; union fscrypt_context ctx; int ctxsize; ctxsize = fscrypt_context_for_new_inode(&ctx, inode); if (ctxsize < 0) return ctxsize; /* * This may be the first time the inode number is available, so do any * delayed key setup that requires the inode number. */ if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) fscrypt_hash_inode_number(ci, ci->ci_master_key); return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data); } EXPORT_SYMBOL_GPL(fscrypt_set_context); /** * fscrypt_parse_test_dummy_encryption() - parse the test_dummy_encryption mount option * @param: the mount option * @dummy_policy: (input/output) the place to write the dummy policy that will * result from parsing the option. Zero-initialize this. If a policy is * already set here (due to test_dummy_encryption being given multiple * times), then this function will verify that the policies are the same. * * Return: 0 on success; -EINVAL if the argument is invalid; -EEXIST if the * argument conflicts with one already specified; or -ENOMEM. */ int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy) { const char *arg = "v2"; union fscrypt_policy *policy; int err; if (param->type == fs_value_is_string && *param->string) arg = param->string; policy = kzalloc(sizeof(*policy), GFP_KERNEL); if (!policy) return -ENOMEM; if (!strcmp(arg, "v1")) { policy->version = FSCRYPT_POLICY_V1; policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; memset(policy->v1.master_key_descriptor, 0x42, FSCRYPT_KEY_DESCRIPTOR_SIZE); } else if (!strcmp(arg, "v2")) { policy->version = FSCRYPT_POLICY_V2; policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; err = fscrypt_get_test_dummy_key_identifier( policy->v2.master_key_identifier); if (err) goto out; } else { err = -EINVAL; goto out; } if (dummy_policy->policy) { if (fscrypt_policies_equal(policy, dummy_policy->policy)) err = 0; else err = -EEXIST; goto out; } dummy_policy->policy = policy; policy = NULL; err = 0; out: kfree(policy); return err; } EXPORT_SYMBOL_GPL(fscrypt_parse_test_dummy_encryption); /** * fscrypt_dummy_policies_equal() - check whether two dummy policies are equal * @p1: the first test dummy policy (may be unset) * @p2: the second test dummy policy (may be unset) * * Return: %true if the dummy policies are both set and equal, or both unset. */ bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2) { if (!p1->policy && !p2->policy) return true; if (!p1->policy || !p2->policy) return false; return fscrypt_policies_equal(p1->policy, p2->policy); } EXPORT_SYMBOL_GPL(fscrypt_dummy_policies_equal); /** * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption' * @seq: the seq_file to print the option to * @sep: the separator character to use * @sb: the filesystem whose options are being shown * * Show the test_dummy_encryption mount option, if it was specified. * This is mainly used for /proc/mounts. */ void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { const union fscrypt_policy *policy = fscrypt_get_dummy_policy(sb); int vers; if (!policy) return; vers = policy->version; if (vers == FSCRYPT_POLICY_V1) /* Handle numbering quirk */ vers = 1; seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, vers); } EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption); |
2244 2244 2244 2244 2447 2447 2447 100 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 | // SPDX-License-Identifier: GPL-2.0 /* * This contains functions for filename crypto management * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility * * Written by Uday Savagaonkar, 2014. * Modified by Jaegeuk Kim, 2015. * * This has not yet undergone a rigorous security audit. */ #include <linux/namei.h> #include <linux/scatterlist.h> #include <crypto/hash.h> #include <crypto/sha2.h> #include <crypto/skcipher.h> #include "fscrypt_private.h" /* * The minimum message length (input and output length), in bytes, for all * filenames encryption modes. Filenames shorter than this will be zero-padded * before being encrypted. */ #define FSCRYPT_FNAME_MIN_MSG_LEN 16 /* * struct fscrypt_nokey_name - identifier for directory entry when key is absent * * When userspace lists an encrypted directory without access to the key, the * filesystem must present a unique "no-key name" for each filename that allows * it to find the directory entry again if requested. Naively, that would just * mean using the ciphertext filenames. However, since the ciphertext filenames * can contain illegal characters ('\0' and '/'), they must be encoded in some * way. We use base64url. But that can cause names to exceed NAME_MAX (255 * bytes), so we also need to use a strong hash to abbreviate long names. * * The filesystem may also need another kind of hash, the "dirhash", to quickly * find the directory entry. Since filesystems normally compute the dirhash * over the on-disk filename (i.e. the ciphertext), it's not computable from * no-key names that abbreviate the ciphertext using the strong hash to fit in * NAME_MAX. It's also not computable if it's a keyed hash taken over the * plaintext (but it may still be available in the on-disk directory entry); * casefolded directories use this type of dirhash. At least in these cases, * each no-key name must include the name's dirhash too. * * To meet all these requirements, we base64url-encode the following * variable-length structure. It contains the dirhash, or 0's if the filesystem * didn't provide one; up to 149 bytes of the ciphertext name; and for * ciphertexts longer than 149 bytes, also the SHA-256 of the remaining bytes. * * This ensures that each no-key name contains everything needed to find the * directory entry again, contains only legal characters, doesn't exceed * NAME_MAX, is unambiguous unless there's a SHA-256 collision, and that we only * take the performance hit of SHA-256 on very long filenames (which are rare). */ struct fscrypt_nokey_name { u32 dirhash[2]; u8 bytes[149]; u8 sha256[SHA256_DIGEST_SIZE]; }; /* 189 bytes => 252 bytes base64url-encoded, which is <= NAME_MAX (255) */ /* * Decoded size of max-size no-key name, i.e. a name that was abbreviated using * the strong hash and thus includes the 'sha256' field. This isn't simply * sizeof(struct fscrypt_nokey_name), as the padding at the end isn't included. */ #define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256) /* Encoded size of max-size no-key name */ #define FSCRYPT_NOKEY_NAME_MAX_ENCODED \ FSCRYPT_BASE64URL_CHARS(FSCRYPT_NOKEY_NAME_MAX) static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') return true; if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') return true; return false; } /** * fscrypt_fname_encrypt() - encrypt a filename * @inode: inode of the parent directory (for regular filenames) * or of the symlink (for symlink targets). Key must already be * set up. * @iname: the filename to encrypt * @out: (output) the encrypted filename * @olen: size of the encrypted filename. It must be at least @iname->len. * Any extra space is filled with NUL padding before encryption. * * Return: 0 on success, -errno on failure */ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen) { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); const struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; union fscrypt_iv iv; struct scatterlist sg; int res; /* * Copy the filename to the output buffer for encrypting in-place and * pad it with the needed number of NUL bytes. */ if (WARN_ON_ONCE(olen < iname->len)) return -ENOBUFS; memcpy(out, iname->name, iname->len); memset(out + iname->len, 0, olen - iname->len); /* Initialize the IV */ fscrypt_generate_iv(&iv, 0, ci); /* Set up the encryption request */ req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) return -ENOMEM; skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); sg_init_one(&sg, out, olen); skcipher_request_set_crypt(req, &sg, &sg, olen, &iv); /* Do the encryption */ res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res < 0) { fscrypt_err(inode, "Filename encryption failed: %d", res); return res; } return 0; } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt); /** * fname_decrypt() - decrypt a filename * @inode: inode of the parent directory (for regular filenames) * or of the symlink (for symlink targets) * @iname: the encrypted filename to decrypt * @oname: (output) the decrypted filename. The caller must have allocated * enough space for this, e.g. using fscrypt_fname_alloc_buffer(). * * Return: 0 on success, -errno on failure */ static int fname_decrypt(const struct inode *inode, const struct fscrypt_str *iname, struct fscrypt_str *oname) { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; const struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; union fscrypt_iv iv; int res; /* Allocate request */ req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) return -ENOMEM; skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); /* Initialize IV */ fscrypt_generate_iv(&iv, 0, ci); /* Create decryption request */ sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv); res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); skcipher_request_free(req); if (res < 0) { fscrypt_err(inode, "Filename decryption failed: %d", res); return res; } oname->len = strnlen(oname->name, iname->len); return 0; } static const char base64url_table[65] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; #define FSCRYPT_BASE64URL_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) /** * fscrypt_base64url_encode() - base64url-encode some binary data * @src: the binary data to encode * @srclen: the length of @src in bytes * @dst: (output) the base64url-encoded string. Not NUL-terminated. * * Encodes data using base64url encoding, i.e. the "Base 64 Encoding with URL * and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't used, * as it's unneeded and not required by the RFC. base64url is used instead of * base64 to avoid the '/' character, which isn't allowed in filenames. * * Return: the length of the resulting base64url-encoded string in bytes. * This will be equal to FSCRYPT_BASE64URL_CHARS(srclen). */ static int fscrypt_base64url_encode(const u8 *src, int srclen, char *dst) { u32 ac = 0; int bits = 0; int i; char *cp = dst; for (i = 0; i < srclen; i++) { ac = (ac << 8) | src[i]; bits += 8; do { bits -= 6; *cp++ = base64url_table[(ac >> bits) & 0x3f]; } while (bits >= 6); } if (bits) *cp++ = base64url_table[(ac << (6 - bits)) & 0x3f]; return cp - dst; } /** * fscrypt_base64url_decode() - base64url-decode a string * @src: the string to decode. Doesn't need to be NUL-terminated. * @srclen: the length of @src in bytes * @dst: (output) the decoded binary data * * Decodes a string using base64url encoding, i.e. the "Base 64 Encoding with * URL and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't * accepted, nor are non-encoding characters such as whitespace. * * This implementation hasn't been optimized for performance. * * Return: the length of the resulting decoded binary data in bytes, * or -1 if the string isn't a valid base64url string. */ static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst) { u32 ac = 0; int bits = 0; int i; u8 *bp = dst; for (i = 0; i < srclen; i++) { const char *p = strchr(base64url_table, src[i]); if (p == NULL || src[i] == 0) return -1; ac = (ac << 6) | (p - base64url_table); bits += 6; if (bits >= 8) { bits -= 8; *bp++ = (u8)(ac >> bits); } } if (ac & ((1 << bits) - 1)) return -1; return bp - dst; } bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { int padding = 4 << (fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAGS_PAD_MASK); u32 encrypted_len; if (orig_len > max_len) return false; encrypted_len = max_t(u32, orig_len, FSCRYPT_FNAME_MIN_MSG_LEN); encrypted_len = round_up(encrypted_len, padding); *encrypted_len_ret = min(encrypted_len, max_len); return true; } /** * fscrypt_fname_encrypted_size() - calculate length of encrypted filename * @inode: parent inode of dentry name being encrypted. Key must * already be set up. * @orig_len: length of the original filename * @max_len: maximum length to return * @encrypted_len_ret: where calculated length should be returned (on success) * * Filenames that are shorter than the maximum length may have their lengths * increased slightly by encryption, due to padding that is applied. * * Return: false if the orig_len is greater than max_len. Otherwise, true and * fill out encrypted_len_ret with the length (up to max_len). */ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, orig_len, max_len, encrypted_len_ret); } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); /** * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames * @max_encrypted_len: maximum length of encrypted filenames the buffer will be * used to present * @crypto_str: (output) buffer to allocate * * Allocate a buffer that is large enough to hold any decrypted or encoded * filename (null-terminated), for the given maximum encrypted filename length. * * Return: 0 on success, -errno on failure */ int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { u32 max_presented_len = max_t(u32, FSCRYPT_NOKEY_NAME_MAX_ENCODED, max_encrypted_len); crypto_str->name = kmalloc(max_presented_len + 1, GFP_NOFS); if (!crypto_str->name) return -ENOMEM; crypto_str->len = max_presented_len; return 0; } EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** * fscrypt_fname_free_buffer() - free a buffer for presented filenames * @crypto_str: the buffer to free * * Free a buffer that was allocated by fscrypt_fname_alloc_buffer(). */ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { if (!crypto_str) return; kfree(crypto_str->name); crypto_str->name = NULL; } EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** * fscrypt_fname_disk_to_usr() - convert an encrypted filename to * user-presentable form * @inode: inode of the parent directory (for regular filenames) * or of the symlink (for symlink targets) * @hash: first part of the name's dirhash, if applicable. This only needs to * be provided if the filename is located in an indexed directory whose * encryption key may be unavailable. Not needed for symlink targets. * @minor_hash: second part of the name's dirhash, if applicable * @iname: encrypted filename to convert. May also be "." or "..", which * aren't actually encrypted. * @oname: output buffer for the user-presentable filename. The caller must * have allocated enough space for this, e.g. using * fscrypt_fname_alloc_buffer(). * * If the key is available, we'll decrypt the disk name. Otherwise, we'll * encode it for presentation in fscrypt_nokey_name format. * See struct fscrypt_nokey_name for details. * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); struct fscrypt_nokey_name nokey_name; u32 size; /* size of the unencoded no-key name */ if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; return 0; } if (iname->len < FSCRYPT_FNAME_MIN_MSG_LEN) return -EUCLEAN; if (fscrypt_has_encryption_key(inode)) return fname_decrypt(inode, iname, oname); /* * Sanity check that struct fscrypt_nokey_name doesn't have padding * between fields and that its encoded size never exceeds NAME_MAX. */ BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, dirhash) != offsetof(struct fscrypt_nokey_name, bytes)); BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, bytes) != offsetof(struct fscrypt_nokey_name, sha256)); BUILD_BUG_ON(FSCRYPT_NOKEY_NAME_MAX_ENCODED > NAME_MAX); nokey_name.dirhash[0] = hash; nokey_name.dirhash[1] = minor_hash; if (iname->len <= sizeof(nokey_name.bytes)) { memcpy(nokey_name.bytes, iname->name, iname->len); size = offsetof(struct fscrypt_nokey_name, bytes[iname->len]); } else { memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes)); /* Compute strong hash of remaining part of name. */ sha256(&iname->name[sizeof(nokey_name.bytes)], iname->len - sizeof(nokey_name.bytes), nokey_name.sha256); size = FSCRYPT_NOKEY_NAME_MAX; } oname->len = fscrypt_base64url_encode((const u8 *)&nokey_name, size, oname->name); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** * fscrypt_setup_filename() - prepare to search a possibly encrypted directory * @dir: the directory that will be searched * @iname: the user-provided filename being searched for * @lookup: 1 if we're allowed to proceed without the key because it's * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot * proceed without the key because we're going to create the dir_entry. * @fname: the filename information to be filled in * * Given a user-provided filename @iname, this function sets @fname->disk_name * to the name that would be stored in the on-disk directory entry, if possible. * If the directory is unencrypted this is simply @iname. Else, if we have the * directory's encryption key, then @iname is the plaintext, so we encrypt it to * get the disk_name. * * Else, for keyless @lookup operations, @iname should be a no-key name, so we * decode it to get the struct fscrypt_nokey_name. Non-@lookup operations will * be impossible in this case, so we fail them with ENOKEY. * * If successful, fscrypt_free_filename() must be called later to clean up. * * Return: 0 on success, -errno on failure */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { struct fscrypt_nokey_name *nokey_name; int ret; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; if (!IS_ENCRYPTED(dir) || fscrypt_is_dot_dotdot(iname)) { fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } ret = fscrypt_get_encryption_info(dir, lookup); if (ret) return ret; if (fscrypt_has_encryption_key(dir)) { if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, GFP_NOFS); if (!fname->crypto_buf.name) return -ENOMEM; ret = fscrypt_fname_encrypt(dir, iname, fname->crypto_buf.name, fname->crypto_buf.len); if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; return 0; } if (!lookup) return -ENOKEY; fname->is_nokey_name = true; /* * We don't have the key and we are doing a lookup; decode the * user-supplied name */ if (iname->len > FSCRYPT_NOKEY_NAME_MAX_ENCODED) return -ENOENT; fname->crypto_buf.name = kmalloc(FSCRYPT_NOKEY_NAME_MAX, GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; ret = fscrypt_base64url_decode(iname->name, iname->len, fname->crypto_buf.name); if (ret < (int)offsetof(struct fscrypt_nokey_name, bytes[1]) || (ret > offsetof(struct fscrypt_nokey_name, sha256) && ret != FSCRYPT_NOKEY_NAME_MAX)) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; nokey_name = (void *)fname->crypto_buf.name; fname->hash = nokey_name->dirhash[0]; fname->minor_hash = nokey_name->dirhash[1]; if (ret != FSCRYPT_NOKEY_NAME_MAX) { /* The full ciphertext filename is available. */ fname->disk_name.name = nokey_name->bytes; fname->disk_name.len = ret - offsetof(struct fscrypt_nokey_name, bytes); } return 0; errout: kfree(fname->crypto_buf.name); return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); /** * fscrypt_match_name() - test whether the given name matches a directory entry * @fname: the name being searched for * @de_name: the name from the directory entry * @de_name_len: the length of @de_name in bytes * * Normally @fname->disk_name will be set, and in that case we simply compare * that to the name stored in the directory entry. The only exception is that * if we don't have the key for an encrypted directory and the name we're * looking for is very long, then we won't have the full disk_name and instead * we'll need to match against a fscrypt_nokey_name that includes a strong hash. * * Return: %true if the name matches, otherwise %false. */ bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len) { const struct fscrypt_nokey_name *nokey_name = (const void *)fname->crypto_buf.name; u8 digest[SHA256_DIGEST_SIZE]; if (likely(fname->disk_name.name)) { if (de_name_len != fname->disk_name.len) return false; return !memcmp(de_name, fname->disk_name.name, de_name_len); } if (de_name_len <= sizeof(nokey_name->bytes)) return false; if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes))) return false; sha256(&de_name[sizeof(nokey_name->bytes)], de_name_len - sizeof(nokey_name->bytes), digest); return !memcmp(digest, nokey_name->sha256, sizeof(digest)); } EXPORT_SYMBOL_GPL(fscrypt_match_name); /** * fscrypt_fname_siphash() - calculate the SipHash of a filename * @dir: the parent directory * @name: the filename to calculate the SipHash of * * Given a plaintext filename @name and a directory @dir which uses SipHash as * its dirhash method and has had its fscrypt key set up, this function * calculates the SipHash of that name using the directory's secret dirhash key. * * Return: the SipHash of @name using the hash key of @dir */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { const struct fscrypt_info *ci = dir->i_crypt_info; WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); return siphash(name->name, name->len, &ci->ci_dirhash_key); } EXPORT_SYMBOL_GPL(fscrypt_fname_siphash); /* * Validate dentries in encrypted directories to make sure we aren't potentially * caching stale dentries after a key has been added. */ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *dir; int err; int valid; /* * Plaintext names are always valid, since fscrypt doesn't support * reverting to no-key names without evicting the directory's inode * -- which implies eviction of the dentries in the directory. */ if (!(dentry->d_flags & DCACHE_NOKEY_NAME)) return 1; /* * No-key name; valid if the directory's key is still unavailable. * * Although fscrypt forbids rename() on no-key names, we still must use * dget_parent() here rather than use ->d_parent directly. That's * because a corrupted fs image may contain directory hard links, which * the VFS handles by moving the directory's dentry tree in the dcache * each time ->lookup() finds the directory and it already has a dentry * elsewhere. Thus ->d_parent can be changing, and we must safely grab * a reference to some ->d_parent to prevent it from being freed. */ if (flags & LOOKUP_RCU) return -ECHILD; dir = dget_parent(dentry); /* * Pass allow_unsupported=true, so that files with an unsupported * encryption policy can be deleted. */ err = fscrypt_get_encryption_info(d_inode(dir), true); valid = !fscrypt_has_encryption_key(d_inode(dir)); dput(dir); if (err < 0) return err; return valid; } EXPORT_SYMBOL_GPL(fscrypt_d_revalidate); |
1038 1040 1036 18 794 1040 1039 793 1038 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/extable.h> #include <linux/uaccess.h> #include <linux/sched/debug.h> #include <linux/bitfield.h> #include <xen/xen.h> #include <asm/fpu/api.h> #include <asm/sev.h> #include <asm/traps.h> #include <asm/kdebug.h> #include <asm/insn-eval.h> #include <asm/sgx.h> static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr) { int reg_offset = pt_regs_offset(regs, nr); static unsigned long __dummy; if (WARN_ON_ONCE(reg_offset < 0)) return &__dummy; return (unsigned long *)((unsigned long)regs + reg_offset); } static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; } static bool ex_handler_default(const struct exception_table_entry *e, struct pt_regs *regs) { if (e->data & EX_FLAG_CLEAR_AX) regs->ax = 0; if (e->data & EX_FLAG_CLEAR_DX) regs->dx = 0; regs->ip = ex_fixup_addr(e); return true; } /* * This is the *very* rare case where we do a "load_unaligned_zeropad()" * and it's a page crosser into a non-existent page. * * This happens when we optimistically load a pathname a word-at-a-time * and the name is less than the full word and the next page is not * mapped. Typically that only happens for CONFIG_DEBUG_PAGEALLOC. * * NOTE! The faulting address is always a 'mov mem,reg' type instruction * of size 'long', and the exception fixup must always point to right * after the instruction. */ static bool ex_handler_zeropad(const struct exception_table_entry *e, struct pt_regs *regs, unsigned long fault_addr) { struct insn insn; const unsigned long mask = sizeof(long) - 1; unsigned long offset, addr, next_ip, len; unsigned long *reg; next_ip = ex_fixup_addr(e); len = next_ip - regs->ip; if (len > MAX_INSN_SIZE) return false; if (insn_decode(&insn, (void *) regs->ip, len, INSN_MODE_KERN)) return false; if (insn.length != len) return false; if (insn.opcode.bytes[0] != 0x8b) return false; if (insn.opnd_bytes != sizeof(long)) return false; addr = (unsigned long) insn_get_addr_ref(&insn, regs); if (addr == ~0ul) return false; offset = addr & mask; addr = addr & ~mask; if (fault_addr != addr + sizeof(long)) return false; reg = insn_get_modrm_reg_ptr(&insn, regs); if (!reg) return false; *reg = *(unsigned long *)addr >> (offset * 8); return ex_handler_default(e, regs); } static bool ex_handler_fault(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { regs->ax = trapnr; return ex_handler_default(fixup, regs); } static bool ex_handler_sgx(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG; return ex_handler_default(fixup, regs); } /* * Handler for when we fail to restore a task's FPU state. We should never get * here because the FPU state of a task using the FPU (task->thread.fpu.state) * should always be valid. However, past bugs have allowed userspace to set * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). * These caused XRSTOR to fail when switching to the task, leaking the FPU * registers of the task previously executing on the CPU. Mitigate this class * of vulnerability by restoring from the initial state (essentially, zeroing * out all the FPU registers) if we can't restore from the task's FPU state. */ static bool ex_handler_fprestore(const struct exception_table_entry *fixup, struct pt_regs *regs) { regs->ip = ex_fixup_addr(fixup); WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); fpu_reset_from_exception_fixup(); return true; } /* * On x86-64, we end up being imprecise with 'access_ok()', and allow * non-canonical user addresses to make the range comparisons simpler, * and to not have to worry about LAM being enabled. * * In fact, we allow up to one page of "slop" at the sign boundary, * which means that we can do access_ok() by just checking the sign * of the pointer for the common case of having a small access size. */ static bool gp_fault_address_ok(unsigned long fault_address) { #ifdef CONFIG_X86_64 /* Is it in the "user space" part of the non-canonical space? */ if (valid_user_address(fault_address)) return true; /* .. or just above it? */ fault_address -= PAGE_SIZE; if (valid_user_address(fault_address)) return true; #endif return false; } static bool ex_handler_uaccess(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr, unsigned long fault_address) { WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address), "General protection fault in user access. Non-canonical address?"); return ex_handler_default(fixup, regs); } static bool ex_handler_copy(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); return ex_handler_fault(fixup, regs, trapnr); } static bool ex_handler_msr(const struct exception_table_entry *fixup, struct pt_regs *regs, bool wrmsr, bool safe, int reg) { if (__ONCE_LITE_IF(!safe && wrmsr)) { pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, regs->ip, (void *)regs->ip); show_stack_regs(regs); } if (__ONCE_LITE_IF(!safe && !wrmsr)) { pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip); show_stack_regs(regs); } if (!wrmsr) { /* Pretend that the read succeeded and returned 0. */ regs->ax = 0; regs->dx = 0; } if (safe) *pt_regs_nr(regs, reg) = -EIO; return ex_handler_default(fixup, regs); } static bool ex_handler_clear_fs(const struct exception_table_entry *fixup, struct pt_regs *regs) { if (static_cpu_has(X86_BUG_NULL_SEG)) asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); asm volatile ("mov %0, %%fs" : : "rm" (0)); return ex_handler_default(fixup, regs); } static bool ex_handler_imm_reg(const struct exception_table_entry *fixup, struct pt_regs *regs, int reg, int imm) { *pt_regs_nr(regs, reg) = (long)imm; return ex_handler_default(fixup, regs); } static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr, unsigned long fault_address, int reg, int imm) { regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg); return ex_handler_uaccess(fixup, regs, trapnr, fault_address); } int ex_get_fixup_type(unsigned long ip) { const struct exception_table_entry *e = search_exception_tables(ip); return e ? FIELD_GET(EX_DATA_TYPE_MASK, e->data) : EX_TYPE_NONE; } int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) { const struct exception_table_entry *e; int type, reg, imm; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; extern u32 pnp_bios_is_utter_crap; pnp_bios_is_utter_crap = 1; printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n"); __asm__ volatile( "movl %0, %%esp\n\t" "jmp *%1\n\t" : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); panic("do_trap: can't hit this"); } #endif e = search_exception_tables(regs->ip); if (!e) return 0; type = FIELD_GET(EX_DATA_TYPE_MASK, e->data); reg = FIELD_GET(EX_DATA_REG_MASK, e->data); imm = FIELD_GET(EX_DATA_IMM_MASK, e->data); switch (type) { case EX_TYPE_DEFAULT: case EX_TYPE_DEFAULT_MCE_SAFE: return ex_handler_default(e, regs); case EX_TYPE_FAULT: case EX_TYPE_FAULT_MCE_SAFE: return ex_handler_fault(e, regs, trapnr); case EX_TYPE_UACCESS: return ex_handler_uaccess(e, regs, trapnr, fault_addr); case EX_TYPE_COPY: return ex_handler_copy(e, regs, trapnr); case EX_TYPE_CLEAR_FS: return ex_handler_clear_fs(e, regs); case EX_TYPE_FPU_RESTORE: return ex_handler_fprestore(e, regs); case EX_TYPE_BPF: return ex_handler_bpf(e, regs); case EX_TYPE_WRMSR: return ex_handler_msr(e, regs, true, false, reg); case EX_TYPE_RDMSR: return ex_handler_msr(e, regs, false, false, reg); case EX_TYPE_WRMSR_SAFE: return ex_handler_msr(e, regs, true, true, reg); case EX_TYPE_RDMSR_SAFE: return ex_handler_msr(e, regs, false, true, reg); case EX_TYPE_WRMSR_IN_MCE: ex_handler_msr_mce(regs, true); break; case EX_TYPE_RDMSR_IN_MCE: ex_handler_msr_mce(regs, false); break; case EX_TYPE_POP_REG: regs->sp += sizeof(long); fallthrough; case EX_TYPE_IMM_REG: return ex_handler_imm_reg(e, regs, reg, imm); case EX_TYPE_FAULT_SGX: return ex_handler_sgx(e, regs, trapnr); case EX_TYPE_UCOPY_LEN: return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm); case EX_TYPE_ZEROPAD: return ex_handler_zeropad(e, regs, fault_addr); } BUG(); } extern unsigned int early_recursion_flag; /* Restricted version used during very early boot */ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) { /* Ignore early NMIs. */ if (trapnr == X86_TRAP_NMI) return; if (early_recursion_flag > 2) goto halt_loop; /* * Old CPUs leave the high bits of CS on the stack * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. * Xen pv domains are not using the default __KERNEL_CS. */ if (!xen_pv_domain() && regs->cs != __KERNEL_CS) goto fail; /* * The full exception fixup machinery is available as soon as * the early IDT is loaded. This means that it is the * responsibility of extable users to either function correctly * when handlers are invoked early or to simply avoid causing * exceptions before they're ready to handle them. * * This is better than filtering which handlers can be used, * because refusing to call a handler here is guaranteed to * result in a hard-to-debug panic. * * Keep in mind that not all vectors actually get here. Early * page faults, for example, are special. */ if (fixup_exception(regs, trapnr, regs->orig_ax, 0)) return; if (trapnr == X86_TRAP_UD) { if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { /* Skip the ud2. */ regs->ip += LEN_UD2; return; } /* * If this was a BUG and report_bug returns or if this * was just a normal #UD, we want to continue onward and * crash. */ } fail: early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n", (unsigned)trapnr, (unsigned long)regs->cs, regs->ip, regs->orig_ax, read_cr2()); show_regs(regs); halt_loop: while (true) halt(); } |
1123 1124 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 | // SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which manage high resolution tick * related events. * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> #include "tick-internal.h" /** * tick_program_event - program the CPU local timer device for the next event */ int tick_program_event(ktime_t expires, int force) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); if (unlikely(expires == KTIME_MAX)) { /* * We don't need the clock event device any more, stop it. */ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); dev->next_event = KTIME_MAX; return 0; } if (unlikely(clockevent_state_oneshot_stopped(dev))) { /* * We need the clock event again, configure it in ONESHOT mode * before using it. */ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); } return clockevents_program_event(dev, expires, force); } /** * tick_resume_oneshot - resume oneshot mode */ void tick_resume_oneshot(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(dev, ktime_get(), true); } /** * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) */ void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), ktime_t next_event) { newdev->event_handler = handler; clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(newdev, next_event, true); } /** * tick_switch_to_oneshot - switch to oneshot mode */ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || !tick_device_is_functional(dev)) { pr_info("Clockevents: could not switch to one-shot mode:"); if (!dev) { pr_cont(" no tick device\n"); } else { if (!tick_device_is_functional(dev)) pr_cont(" %s is not functional.\n", dev->name); else pr_cont(" %s does not support one-shot mode.\n", dev->name); } return -EINVAL; } td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_switch_to_oneshot(); return 0; } /** * tick_oneshot_mode_active - check whether the system is in oneshot mode * * returns 1 when either nohz or highres are enabled. otherwise 0. */ int tick_oneshot_mode_active(void) { unsigned long flags; int ret; local_irq_save(flags); ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; local_irq_restore(flags); return ret; } #ifdef CONFIG_HIGH_RES_TIMERS /** * tick_init_highres - switch to high resolution mode * * Called with interrupts disabled. */ int tick_init_highres(void) { return tick_switch_to_oneshot(hrtimer_interrupt); } #endif |
190 2 166 166 191 166 1 401 401 3 222 450 54 54 1 1 1 1 1 1 2 2 2 2 2 3 3 4 3 2 1 4 3 2 1 1 1 1 4 3 2 1 1 2 2 2 2 2 2 2 2 2 3 2 2 3 1 1 1 1 2 2 2 2 2 2 4 1 3 3 1 4 1 1 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 | /* * net/tipc/node.c: TIPC node management routines * * Copyright (c) 2000-2006, 2012-2016, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "link.h" #include "node.h" #include "name_distr.h" #include "socket.h" #include "bcast.h" #include "monitor.h" #include "discover.h" #include "netlink.h" #include "trace.h" #include "crypto.h" #define INVALID_NODE_SIG 0x10000 #define NODE_CLEANUP_AFTER 300000 /* Flags used to take different actions according to flag type * TIPC_NOTIFY_NODE_DOWN: notify node is down * TIPC_NOTIFY_NODE_UP: notify node is up * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type */ enum { TIPC_NOTIFY_NODE_DOWN = (1 << 3), TIPC_NOTIFY_NODE_UP = (1 << 4), TIPC_NOTIFY_LINK_UP = (1 << 6), TIPC_NOTIFY_LINK_DOWN = (1 << 7) }; struct tipc_link_entry { struct tipc_link *link; spinlock_t lock; /* per link */ u32 mtu; struct sk_buff_head inputq; struct tipc_media_addr maddr; }; struct tipc_bclink_entry { struct tipc_link *link; struct sk_buff_head inputq1; struct sk_buff_head arrvq; struct sk_buff_head inputq2; struct sk_buff_head namedq; u16 named_rcv_nxt; bool named_open; }; /** * struct tipc_node - TIPC node structure * @addr: network address of node * @kref: reference counter to node object * @lock: rwlock governing access to structure * @net: the applicable net namespace * @hash: links to adjacent nodes in unsorted hash chain * @inputq: pointer to input queue containing messages for msg event * @namedq: pointer to name table input queue with name table messages * @active_links: bearer ids of active links, used as index into links[] array * @links: array containing references to all links to node * @bc_entry: broadcast link entry * @action_flags: bit mask of different types of node actions * @state: connectivity state vs peer node * @preliminary: a preliminary node or not * @failover_sent: failover sent or not * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) * @link_cnt: number of links to node * @capabilities: bitmap, indicating peer node's functional capabilities * @signature: node instance identifier * @link_id: local and remote bearer ids of changing link, if any * @peer_id: 128-bit ID of peer * @peer_id_string: ID string of peer * @publ_list: list of publications * @conn_sks: list of connections (FIXME) * @timer: node's keepalive timer * @keepalive_intv: keepalive interval in milliseconds * @rcu: rcu struct for tipc_node * @delete_at: indicates the time for deleting a down node * @peer_net: peer's net namespace * @peer_hash_mix: hash for this peer (FIXME) * @crypto_rx: RX crypto handler */ struct tipc_node { u32 addr; struct kref kref; rwlock_t lock; struct net *net; struct hlist_node hash; int active_links[2]; struct tipc_link_entry links[MAX_BEARERS]; struct tipc_bclink_entry bc_entry; int action_flags; struct list_head list; int state; bool preliminary; bool failover_sent; u16 sync_point; int link_cnt; u16 working_links; u16 capabilities; u32 signature; u32 link_id; u8 peer_id[16]; char peer_id_string[NODE_ID_STR_LEN]; struct list_head publ_list; struct list_head conn_sks; unsigned long keepalive_intv; struct timer_list timer; struct rcu_head rcu; unsigned long delete_at; struct net *peer_net; u32 peer_hash_mix; #ifdef CONFIG_TIPC_CRYPTO struct tipc_crypto *crypto_rx; #endif }; /* Node FSM states and events: */ enum { SELF_DOWN_PEER_DOWN = 0xdd, SELF_UP_PEER_UP = 0xaa, SELF_DOWN_PEER_LEAVING = 0xd1, SELF_UP_PEER_COMING = 0xac, SELF_COMING_PEER_UP = 0xca, SELF_LEAVING_PEER_DOWN = 0x1d, NODE_FAILINGOVER = 0xf0, NODE_SYNCHING = 0xcc }; enum { SELF_ESTABL_CONTACT_EVT = 0xece, SELF_LOST_CONTACT_EVT = 0x1ce, PEER_ESTABL_CONTACT_EVT = 0x9ece, PEER_LOST_CONTACT_EVT = 0x91ce, NODE_FAILOVER_BEGIN_EVT = 0xfbe, NODE_FAILOVER_END_EVT = 0xfee, NODE_SYNCH_BEGIN_EVT = 0xcbe, NODE_SYNCH_END_EVT = 0xcee }; static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr **maddr); static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete); static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq); static void tipc_node_delete(struct tipc_node *node); static void tipc_node_timeout(struct timer_list *t); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); static bool node_is_up(struct tipc_node *n); static void tipc_node_delete_from_list(struct tipc_node *node); struct tipc_sock_conn { u32 port; u32 peer_port; u32 peer_node; struct list_head list; }; static struct tipc_link *node_active_link(struct tipc_node *n, int sel) { int bearer_id = n->active_links[sel & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) return NULL; return n->links[bearer_id].link; } int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected) { struct tipc_node *n; int bearer_id; unsigned int mtu = MAX_MSG_SIZE; n = tipc_node_find(net, addr); if (unlikely(!n)) return mtu; /* Allow MAX_MSG_SIZE when building connection oriented message * if they are in the same core network */ if (n->peer_net && connected) { tipc_node_put(n); return mtu; } bearer_id = n->active_links[sel & 1]; if (likely(bearer_id != INVALID_BEARER_ID)) mtu = n->links[bearer_id].mtu; tipc_node_put(n); return mtu; } bool tipc_node_get_id(struct net *net, u32 addr, u8 *id) { u8 *own_id = tipc_own_id(net); struct tipc_node *n; if (!own_id) return true; if (addr == tipc_own_addr(net)) { memcpy(id, own_id, TIPC_NODEID_LEN); return true; } n = tipc_node_find(net, addr); if (!n) return false; memcpy(id, &n->peer_id, TIPC_NODEID_LEN); tipc_node_put(n); return true; } u16 tipc_node_get_capabilities(struct net *net, u32 addr) { struct tipc_node *n; u16 caps; n = tipc_node_find(net, addr); if (unlikely(!n)) return TIPC_NODE_CAPABILITIES; caps = n->capabilities; tipc_node_put(n); return caps; } u32 tipc_node_get_addr(struct tipc_node *node) { return (node) ? node->addr : 0; } char *tipc_node_get_id_str(struct tipc_node *node) { return node->peer_id_string; } #ifdef CONFIG_TIPC_CRYPTO /** * tipc_node_crypto_rx - Retrieve crypto RX handle from node * @__n: target tipc_node * Note: node ref counter must be held first! */ struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n) { return (__n) ? __n->crypto_rx : NULL; } struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) { return container_of(pos, struct tipc_node, list)->crypto_rx; } struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr) { struct tipc_node *n; n = tipc_node_find(net, addr); return (n) ? n->crypto_rx : NULL; } #endif static void tipc_node_free(struct rcu_head *rp) { struct tipc_node *n = container_of(rp, struct tipc_node, rcu); #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_stop(&n->crypto_rx); #endif kfree(n); } static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); kfree(n->bc_entry.link); call_rcu(&n->rcu, tipc_node_free); } void tipc_node_put(struct tipc_node *node) { kref_put(&node->kref, tipc_node_kref_release); } void tipc_node_get(struct tipc_node *node) { kref_get(&node->kref); } /* * tipc_node_find - locate specified node object, if it exists */ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node; unsigned int thash = tipc_hashfn(addr); rcu_read_lock(); hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { if (node->addr != addr || node->preliminary) continue; if (!kref_get_unless_zero(&node->kref)) node = NULL; break; } rcu_read_unlock(); return node; } /* tipc_node_find_by_id - locate specified node object by its 128-bit id * Note: this function is called only when a discovery request failed * to find the node by its 32-bit id, and is not time critical */ static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool found = false; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { read_lock_bh(&n->lock); if (!memcmp(id, n->peer_id, 16) && kref_get_unless_zero(&n->kref)) found = true; read_unlock_bh(&n->lock); if (found) break; } rcu_read_unlock(); return found ? n : NULL; } static void tipc_node_read_lock(struct tipc_node *n) __acquires(n->lock) { read_lock_bh(&n->lock); } static void tipc_node_read_unlock(struct tipc_node *n) __releases(n->lock) { read_unlock_bh(&n->lock); } static void tipc_node_write_lock(struct tipc_node *n) __acquires(n->lock) { write_lock_bh(&n->lock); } static void tipc_node_write_unlock_fast(struct tipc_node *n) __releases(n->lock) { write_unlock_bh(&n->lock); } static void tipc_node_write_unlock(struct tipc_node *n) __releases(n->lock) { struct tipc_socket_addr sk; struct net *net = n->net; u32 flags = n->action_flags; struct list_head *publ_list; struct tipc_uaddr ua; u32 bearer_id, node; if (likely(!flags)) { write_unlock_bh(&n->lock); return; } tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, TIPC_LINK_STATE, n->addr, n->addr); sk.ref = n->link_id; sk.node = tipc_own_addr(net); node = n->addr; bearer_id = n->link_id & 0xffff; publ_list = &n->publ_list; n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP); write_unlock_bh(&n->lock); if (flags & TIPC_NOTIFY_NODE_DOWN) tipc_publ_notify(net, publ_list, node, n->capabilities); if (flags & TIPC_NOTIFY_NODE_UP) tipc_named_node_up(net, node, n->capabilities); if (flags & TIPC_NOTIFY_LINK_UP) { tipc_mon_peer_up(net, node, bearer_id); tipc_nametbl_publish(net, &ua, &sk, sk.ref); } if (flags & TIPC_NOTIFY_LINK_DOWN) { tipc_mon_peer_down(net, node, bearer_id); tipc_nametbl_withdraw(net, &ua, &sk, sk.ref); } } static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes) { int net_id = tipc_netid(n->net); struct tipc_net *tn_peer; struct net *tmp; u32 hash_chk; if (n->peer_net) return; for_each_net_rcu(tmp) { tn_peer = tipc_net(tmp); if (!tn_peer) continue; /* Integrity checking whether node exists in namespace or not */ if (tn_peer->net_id != net_id) continue; if (memcmp(n->peer_id, tn_peer->node_id, NODE_ID_LEN)) continue; hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random); if (hash_mixes ^ hash_chk) continue; n->peer_net = tmp; n->peer_hash_mix = hash_mixes; break; } } struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, u16 capabilities, u32 hash_mixes, bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l, *snd_l = tipc_bc_sndlink(net); struct tipc_node *n, *temp_node; unsigned long intv; int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr) ?: tipc_node_find_by_id(net, peer_id); if (n) { if (!n->preliminary) goto update; if (preliminary) goto exit; /* A preliminary node becomes "real" now, refresh its data */ tipc_node_write_lock(n); if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { pr_warn("Broadcast rcv link refresh failed, no memory\n"); tipc_node_write_unlock_fast(n); tipc_node_put(n); n = NULL; goto exit; } n->preliminary = false; n->addr = addr; hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_del_rcu(&n->list); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } list_add_tail_rcu(&n->list, &temp_node->list); tipc_node_write_unlock_fast(n); update: if (n->peer_hash_mix ^ hash_mixes) tipc_node_assign_peer_net(n, hash_mixes); if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ tipc_node_write_lock(n); n->capabilities = capabilities; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { l = n->links[bearer_id].link; if (l) tipc_link_update_caps(l, capabilities); } tipc_node_write_unlock_fast(n); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { pr_warn("Node creation failed, no memory\n"); goto exit; } tipc_nodeid2string(n->peer_id_string, peer_id); #ifdef CONFIG_TIPC_CRYPTO if (unlikely(tipc_crypto_start(&n->crypto_rx, net, n))) { pr_warn("Failed to start crypto RX(%s)!\n", n->peer_id_string); kfree(n); n = NULL; goto exit; } #endif n->addr = addr; n->preliminary = preliminary; memcpy(&n->peer_id, peer_id, 16); n->net = net; n->peer_net = NULL; n->peer_hash_mix = 0; /* Assign kernel local namespace if exists */ tipc_node_assign_peer_net(n, hash_mixes); n->capabilities = capabilities; kref_init(&n->kref); rwlock_init(&n->lock); INIT_HLIST_NODE(&n->hash); INIT_LIST_HEAD(&n->list); INIT_LIST_HEAD(&n->publ_list); INIT_LIST_HEAD(&n->conn_sks); skb_queue_head_init(&n->bc_entry.namedq); skb_queue_head_init(&n->bc_entry.inputq1); __skb_queue_head_init(&n->bc_entry.arrvq); skb_queue_head_init(&n->bc_entry.inputq2); for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); n->state = SELF_DOWN_PEER_LEAVING; n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; if (!preliminary && !tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { pr_warn("Broadcast rcv link creation failed, no memory\n"); tipc_node_put(n); n = NULL; goto exit; } tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); /* Start a slow timer anyway, crypto needs it */ n->keepalive_intv = 10000; intv = jiffies + msecs_to_jiffies(n->keepalive_intv); if (!mod_timer(&n->timer, intv)) tipc_node_get(n); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } list_add_tail_rcu(&n->list, &temp_node->list); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); trace_tipc_node_create(n, true, " "); exit: spin_unlock_bh(&tn->node_list_lock); return n; } static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) { unsigned long tol = tipc_link_tolerance(l); unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; /* Link with lowest tolerance determines timer interval */ if (intv < n->keepalive_intv) n->keepalive_intv = intv; /* Ensure link's abort limit corresponds to current tolerance */ tipc_link_set_abort_limit(l, tol / n->keepalive_intv); } static void tipc_node_delete_from_list(struct tipc_node *node) { #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_key_flush(node->crypto_rx); #endif list_del_rcu(&node->list); hlist_del_rcu(&node->hash); tipc_node_put(node); } static void tipc_node_delete(struct tipc_node *node) { trace_tipc_node_delete(node, true, " "); tipc_node_delete_from_list(node); del_timer_sync(&node->timer); tipc_node_put(node); } void tipc_node_stop(struct net *net) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); list_for_each_entry_safe(node, t_node, &tn->node_list, list) tipc_node_delete(node); spin_unlock_bh(&tn->node_list_lock); } void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = tipc_node_find(net, addr); if (!n) { pr_warn("Node subscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_add_tail(subscr, &n->publ_list); tipc_node_write_unlock_fast(n); tipc_node_put(n); } void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = tipc_node_find(net, addr); if (!n) { pr_warn("Node unsubscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_del_init(subscr); tipc_node_write_unlock_fast(n); tipc_node_put(n); } int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) { struct tipc_node *node; struct tipc_sock_conn *conn; int err = 0; if (in_own_node(net, dnode)) return 0; node = tipc_node_find(net, dnode); if (!node) { pr_warn("Connecting sock to node 0x%x failed\n", dnode); return -EHOSTUNREACH; } conn = kmalloc(sizeof(*conn), GFP_ATOMIC); if (!conn) { err = -EHOSTUNREACH; goto exit; } conn->peer_node = dnode; conn->port = port; conn->peer_port = peer_port; tipc_node_write_lock(node); list_add_tail(&conn->list, &node->conn_sks); tipc_node_write_unlock(node); exit: tipc_node_put(node); return err; } void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) { struct tipc_node *node; struct tipc_sock_conn *conn, *safe; if (in_own_node(net, dnode)) return; node = tipc_node_find(net, dnode); if (!node) return; tipc_node_write_lock(node); list_for_each_entry_safe(conn, safe, &node->conn_sks, list) { if (port != conn->port) continue; list_del(&conn->list); kfree(conn); } tipc_node_write_unlock(node); tipc_node_put(node); } static void tipc_node_clear_links(struct tipc_node *node) { int i; for (i = 0; i < MAX_BEARERS; i++) { struct tipc_link_entry *le = &node->links[i]; if (le->link) { kfree(le->link); le->link = NULL; node->link_cnt--; } } } /* tipc_node_cleanup - delete nodes that does not * have active links for NODE_CLEANUP_AFTER time */ static bool tipc_node_cleanup(struct tipc_node *peer) { struct tipc_node *temp_node; struct tipc_net *tn = tipc_net(peer->net); bool deleted = false; /* If lock held by tipc_node_stop() the node will be deleted anyway */ if (!spin_trylock_bh(&tn->node_list_lock)) return false; tipc_node_write_lock(peer); if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) { tipc_node_clear_links(peer); tipc_node_delete_from_list(peer); deleted = true; } tipc_node_write_unlock(peer); if (!deleted) { spin_unlock_bh(&tn->node_list_lock); return deleted; } /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(peer->net, (tn->capabilities & TIPC_BCAST_RCAST)); spin_unlock_bh(&tn->node_list_lock); return deleted; } /* tipc_node_timeout - handle expiration of node timer */ static void tipc_node_timeout(struct timer_list *t) { struct tipc_node *n = from_timer(n, t, timer); struct tipc_link_entry *le; struct sk_buff_head xmitq; int remains = n->link_cnt; int bearer_id; int rc = 0; trace_tipc_node_timeout(n, false, " "); if (!node_is_up(n) && tipc_node_cleanup(n)) { /*Removing the reference of Timer*/ tipc_node_put(n); return; } #ifdef CONFIG_TIPC_CRYPTO /* Take any crypto key related actions first */ tipc_crypto_timeout(n->crypto_rx); #endif __skb_queue_head_init(&xmitq); /* Initial node interval to value larger (10 seconds), then it will be * recalculated with link lowest tolerance */ tipc_node_read_lock(n); n->keepalive_intv = 10000; tipc_node_read_unlock(n); for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { tipc_node_read_lock(n); le = &n->links[bearer_id]; if (le->link) { spin_lock_bh(&le->lock); /* Link tolerance may change asynchronously: */ tipc_node_calculate_timer(n, le->link); rc = tipc_link_timeout(le->link, &xmitq); spin_unlock_bh(&le->lock); remains--; } tipc_node_read_unlock(n); tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr, n); if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } mod_timer(&n->timer, jiffies + msecs_to_jiffies(n->keepalive_intv)); } /** * __tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * Node lock must be held by caller * Link becomes active (alone or shared) or standby, depending on its priority. */ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; struct tipc_link *ol = node_active_link(n, 0); struct tipc_link *nl = n->links[bearer_id].link; if (!nl || tipc_link_is_up(nl)) return; tipc_link_fsm_evt(nl, LINK_ESTABLISH_EVT); if (!tipc_link_is_up(nl)) return; n->working_links++; n->action_flags |= TIPC_NOTIFY_LINK_UP; n->link_id = tipc_link_id(nl); /* Leave room for tunnel header when returning 'mtu' to users: */ n->links[bearer_id].mtu = tipc_link_mss(nl); tipc_bearer_add_dest(n->net, bearer_id, n->addr); tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id); pr_debug("Established link <%s> on network plane %c\n", tipc_link_name(nl), tipc_link_plane(nl)); trace_tipc_node_link_up(n, true, " "); /* Ensure that a STATE message goes first */ tipc_link_build_state_msg(nl, xmitq); /* First link? => give it both slots */ if (!ol) { *slot0 = bearer_id; *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); n->action_flags |= TIPC_NOTIFY_NODE_UP; tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); return; } /* Second link => redistribute slots */ if (tipc_link_prio(nl) > tipc_link_prio(ol)) { pr_debug("Old link <%s> becomes standby\n", tipc_link_name(ol)); *slot0 = bearer_id; *slot1 = bearer_id; tipc_link_set_active(nl, true); tipc_link_set_active(ol, false); } else if (tipc_link_prio(nl) == tipc_link_prio(ol)) { tipc_link_set_active(nl, true); *slot1 = bearer_id; } else { pr_debug("New link <%s> is standby\n", tipc_link_name(nl)); } /* Prepare synchronization with first link */ tipc_link_tnl_prepare(ol, nl, SYNCH_MSG, xmitq); } /** * tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * * Link becomes active (alone or shared) or standby, depending on its priority. */ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_media_addr *maddr; tipc_node_write_lock(n); __tipc_node_link_up(n, bearer_id, xmitq); maddr = &n->links[bearer_id].maddr; tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr, n); tipc_node_write_unlock(n); } /** * tipc_node_link_failover() - start failover in case "half-failover" * * This function is only called in a very special situation where link * failover can be already started on peer node but not on this node. * This can happen when e.g.:: * * 1. Both links <1A-2A>, <1B-2B> down * 2. Link endpoint 2A up, but 1A still down (e.g. due to network * disturbance, wrong session, etc.) * 3. Link <1B-2B> up * 4. Link endpoint 2A down (e.g. due to link tolerance timeout) * 5. Node 2 starts failover onto link <1B-2B> * * ==> Node 1 does never start link/node failover! * * @n: tipc node structure * @l: link peer endpoint failingover (- can be NULL) * @tnl: tunnel link * @xmitq: queue for messages to be xmited on tnl link later */ static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l, struct tipc_link *tnl, struct sk_buff_head *xmitq) { /* Avoid to be "self-failover" that can never end */ if (!tipc_link_is_up(tnl)) return; /* Don't rush, failure link may be in the process of resetting */ if (l && !tipc_link_is_reset(l)) return; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_failover_prepare(l, tnl, xmitq); if (l) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); } /** * __tipc_node_link_down - handle loss of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * @maddr: output media address of the bearer */ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr **maddr) { struct tipc_link_entry *le = &n->links[*bearer_id]; int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; int i, highest = 0, prio; struct tipc_link *l, *_l, *tnl; l = n->links[*bearer_id].link; if (!l || tipc_link_is_reset(l)) return; n->working_links--; n->action_flags |= TIPC_NOTIFY_LINK_DOWN; n->link_id = tipc_link_id(l); tipc_bearer_remove_dest(n->net, *bearer_id, n->addr); pr_debug("Lost link <%s> on network plane %c\n", tipc_link_name(l), tipc_link_plane(l)); /* Select new active link if any available */ *slot0 = INVALID_BEARER_ID; *slot1 = INVALID_BEARER_ID; for (i = 0; i < MAX_BEARERS; i++) { _l = n->links[i].link; if (!_l || !tipc_link_is_up(_l)) continue; if (_l == l) continue; prio = tipc_link_prio(_l); if (prio < highest) continue; if (prio > highest) { highest = prio; *slot0 = i; *slot1 = i; continue; } *slot1 = i; } if (!node_is_up(n)) { if (tipc_link_peer_is_down(l)) tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down!"); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_reset(l); tipc_link_build_reset_msg(l, xmitq); *maddr = &n->links[*bearer_id].maddr; node_lost_contact(n, &le->inputq); tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); return; } tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); /* There is still a working link => initiate failover */ *bearer_id = n->active_links[0]; tnl = n->links[*bearer_id].link; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down -> failover!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); *maddr = &n->links[*bearer_id].maddr; } static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) { struct tipc_link_entry *le = &n->links[bearer_id]; struct tipc_media_addr *maddr = NULL; struct tipc_link *l = le->link; int old_bearer_id = bearer_id; struct sk_buff_head xmitq; if (!l) return; __skb_queue_head_init(&xmitq); tipc_node_write_lock(n); if (!tipc_link_is_establishing(l)) { __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); } else { /* Defuse pending tipc_node_link_up() */ tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); } if (delete) { kfree(l); le->link = NULL; n->link_cnt--; } trace_tipc_node_link_down(n, true, "node link down or deleted!"); tipc_node_write_unlock(n); if (delete) tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr, n); tipc_sk_rcv(n->net, &le->inputq); } static bool node_is_up(struct tipc_node *n) { return n->active_links[0] != INVALID_BEARER_ID; } bool tipc_node_is_up(struct net *net, u32 addr) { struct tipc_node *n; bool retval = false; if (in_own_node(net, addr)) return true; n = tipc_node_find(net, addr); if (!n) return false; retval = node_is_up(n); tipc_node_put(n); return retval; } static u32 tipc_node_suggest_addr(struct net *net, u32 addr) { struct tipc_node *n; addr ^= tipc_net(net)->random; while ((n = tipc_node_find(net, addr))) { tipc_node_put(n); addr++; } return addr; } /* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not * Returns suggested address if any, otherwise 0 */ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool preliminary; u32 sugg_addr; /* Suggest new address if some other peer is using this one */ n = tipc_node_find(net, addr); if (n) { if (!memcmp(n->peer_id, id, NODE_ID_LEN)) addr = 0; tipc_node_put(n); if (!addr) return 0; return tipc_node_suggest_addr(net, addr); } /* Suggest previously used address if peer is known */ n = tipc_node_find_by_id(net, id); if (n) { sugg_addr = n->addr; preliminary = n->preliminary; tipc_node_put(n); if (!preliminary) return sugg_addr; } /* Even this node may be in conflict */ if (tn->trial_addr == addr) return tipc_node_suggest_addr(net, addr); return 0; } void tipc_node_check_dest(struct net *net, u32 addr, u8 *peer_id, struct tipc_bearer *b, u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) { struct tipc_node *n; struct tipc_link *l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; bool link_up = false; bool link_is_reset = false; bool accept_addr = false; bool reset = false; char *if_name; unsigned long intv; u16 session; *dupl_addr = false; *respond = false; n = tipc_node_create(net, addr, peer_id, capabilities, hash_mixes, false); if (!n) return; tipc_node_write_lock(n); le = &n->links[b->identity]; /* Prepare to validate requesting node's signature and media address */ l = le->link; link_up = l && tipc_link_is_up(l); link_is_reset = l && tipc_link_is_reset(l); addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr)); sign_match = (signature == n->signature); /* These three flags give us eight permutations: */ if (sign_match && addr_match && link_up) { /* All is fine. Ignore requests. */ /* Peer node is not a container/local namespace */ if (!n->peer_hash_mix) n->peer_hash_mix = hash_mixes; } else if (sign_match && addr_match && !link_up) { /* Respond. The link will come up in due time */ *respond = true; } else if (sign_match && !addr_match && link_up) { /* Peer has changed i/f address without rebooting. * If so, the link will reset soon, and the next * discovery will be accepted. So we can ignore it. * It may also be a cloned or malicious peer having * chosen the same node address and signature as an * existing one. * Ignore requests until the link goes down, if ever. */ *dupl_addr = true; } else if (sign_match && !addr_match && !link_up) { /* Peer link has changed i/f address without rebooting. * It may also be a cloned or malicious peer; we can't * distinguish between the two. * The signature is correct, so we must accept. */ accept_addr = true; *respond = true; reset = true; } else if (!sign_match && addr_match && link_up) { /* Peer node rebooted. Two possibilities: * - Delayed re-discovery; this link endpoint has already * reset and re-established contact with the peer, before * receiving a discovery message from that node. * (The peer happened to receive one from this node first). * - The peer came back so fast that our side has not * discovered it yet. Probing from this side will soon * reset the link, since there can be no working link * endpoint at the peer end, and the link will re-establish. * Accept the signature, since it comes from a known peer. */ n->signature = signature; } else if (!sign_match && addr_match && !link_up) { /* The peer node has rebooted. * Accept signature, since it is a known peer. */ n->signature = signature; *respond = true; } else if (!sign_match && !addr_match && link_up) { /* Peer rebooted with new address, or a new/duplicate peer. * Ignore until the link goes down, if ever. */ *dupl_addr = true; } else if (!sign_match && !addr_match && !link_up) { /* Peer rebooted with new address, or it is a new peer. * Accept signature and address. */ n->signature = signature; accept_addr = true; *respond = true; reset = true; } if (!accept_addr) goto exit; /* Now create new link if not already existing */ if (!l) { if (n->link_cnt == 2) goto exit; if_name = strchr(b->name, ':') + 1; get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, b->min_win, b->max_win, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, &le->inputq, &n->bc_entry.namedq, &l)) { *respond = false; goto exit; } trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link created!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); if (n->state == NODE_FAILINGOVER) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); link_is_reset = tipc_link_is_reset(l); le->link = l; n->link_cnt++; tipc_node_calculate_timer(n, l); if (n->link_cnt == 1) { intv = jiffies + msecs_to_jiffies(n->keepalive_intv); if (!mod_timer(&n->timer, intv)) tipc_node_get(n); } } memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_write_unlock(n); if (reset && !link_is_reset) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } void tipc_node_delete_links(struct net *net, int bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_link_down(n, bearer_id, true); } rcu_read_unlock(); } static void tipc_node_reset_links(struct tipc_node *n) { int i; pr_warn("Resetting all links to %x\n", n->addr); trace_tipc_node_reset_links(n, true, " "); for (i = 0; i < MAX_BEARERS; i++) { tipc_node_link_down(n, i, false); } } /* tipc_node_fsm_evt - node finite state machine * Determines when contact is allowed with peer node */ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) { int state = n->state; switch (state) { case SELF_DOWN_PEER_DOWN: switch (evt) { case SELF_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_COMING; break; case PEER_ESTABL_CONTACT_EVT: state = SELF_COMING_PEER_UP; break; case SELF_LOST_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_UP_PEER_UP: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_SYNCH_BEGIN_EVT: state = NODE_SYNCHING; break; case NODE_FAILOVER_BEGIN_EVT: state = NODE_FAILINGOVER; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case NODE_SYNCH_END_EVT: case NODE_FAILOVER_END_EVT: break; default: goto illegal_evt; } break; case SELF_DOWN_PEER_LEAVING: switch (evt) { case PEER_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case SELF_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_UP_PEER_COMING: switch (evt) { case PEER_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_UP; break; case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: case NODE_SYNCH_END_EVT: case NODE_FAILOVER_BEGIN_EVT: break; case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_COMING_PEER_UP: switch (evt) { case SELF_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_UP; break; case PEER_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_LOST_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_LEAVING_PEER_DOWN: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case NODE_FAILINGOVER: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_FAILOVER_END_EVT: state = SELF_UP_PEER_UP; break; case NODE_FAILOVER_BEGIN_EVT: case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_SYNCH_BEGIN_EVT: case NODE_SYNCH_END_EVT: default: goto illegal_evt; } break; case NODE_SYNCHING: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_SYNCH_END_EVT: state = SELF_UP_PEER_UP; break; case NODE_FAILOVER_BEGIN_EVT: state = NODE_FAILINGOVER; break; case NODE_SYNCH_BEGIN_EVT: case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; default: pr_err("Unknown node fsm state %x\n", state); break; } trace_tipc_node_fsm(n->peer_id, n->state, state, evt); n->state = state; return; illegal_evt: pr_err("Illegal node fsm evt %x in state %x\n", evt, state); trace_tipc_node_fsm(n->peer_id, n->state, state, evt); } static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq) { struct tipc_sock_conn *conn, *safe; struct tipc_link *l; struct list_head *conns = &n->conn_sks; struct sk_buff *skb; uint i; pr_debug("Lost contact with %x\n", n->addr); n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); trace_tipc_node_lost_contact(n, true, " "); /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); skb_queue_purge(&n->bc_entry.namedq); /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { l = n->links[i].link; if (l) tipc_link_fsm_evt(l, LINK_FAILOVER_END_EVT); } /* Notify publications from this node */ n->action_flags |= TIPC_NOTIFY_NODE_DOWN; n->peer_net = NULL; n->peer_hash_mix = 0; /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, tipc_own_addr(n->net), conn->peer_node, conn->port, conn->peer_port, TIPC_ERR_NO_NODE); if (likely(skb)) skb_queue_tail(inputq, skb); list_del(&conn->list); kfree(conn); } } /** * tipc_node_get_linkname - get the name of a link * * @net: the applicable net namespace * @bearer_id: id of the bearer * @addr: peer node address * @linkname: link name output buffer * @len: size of @linkname output buffer * * Return: 0 on success */ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, char *linkname, size_t len) { struct tipc_link *link; int err = -EINVAL; struct tipc_node *node = tipc_node_find(net, addr); if (!node) return err; if (bearer_id >= MAX_BEARERS) goto exit; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (link) { strncpy(linkname, tipc_link_name(link), len); err = 0; } tipc_node_read_unlock(node); exit: tipc_node_put(node); return err; } /* Caller should hold node lock for the passed node */ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) { void *hdr; struct nlattr *attrs; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NODE_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NODE); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_NODE_ADDR, node->addr)) goto attr_msg_full; if (node_is_up(node)) if (nla_put_flag(msg->skb, TIPC_NLA_NODE_UP)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) { struct tipc_msg *hdr = buf_msg(skb_peek(list)); struct sk_buff_head inputq; switch (msg_user(hdr)) { case TIPC_LOW_IMPORTANCE: case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: if (msg_connected(hdr) || msg_named(hdr) || msg_direct(hdr)) { tipc_loopback_trace(peer_net, list); spin_lock_init(&list->lock); tipc_sk_rcv(peer_net, list); return; } if (msg_mcast(hdr)) { tipc_loopback_trace(peer_net, list); skb_queue_head_init(&inputq); tipc_sk_mcast_rcv(peer_net, list, &inputq); __skb_queue_purge(list); skb_queue_purge(&inputq); return; } return; case MSG_FRAGMENTER: if (tipc_msg_assemble(list)) { tipc_loopback_trace(peer_net, list); skb_queue_head_init(&inputq); tipc_sk_mcast_rcv(peer_net, list, &inputq); __skb_queue_purge(list); skb_queue_purge(&inputq); } return; case GROUP_PROTOCOL: case CONN_MANAGER: tipc_loopback_trace(peer_net, list); spin_lock_init(&list->lock); tipc_sk_rcv(peer_net, list); return; case LINK_PROTOCOL: case NAME_DISTRIBUTOR: case TUNNEL_PROTOCOL: case BCAST_PROTOCOL: return; default: return; } } /** * tipc_node_xmit() - general link level function for message sending * @net: the applicable net namespace * @list: chain of buffers containing message * @dnode: address of destination node * @selector: a number used for deterministic link selection * Consumes the buffer chain. * Return: 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector) { struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; bool node_up = false; struct net *peer_net; int bearer_id; int rc; if (in_own_node(net, dnode)) { tipc_loopback_trace(net, list); spin_lock_init(&list->lock); tipc_sk_rcv(net, list); return 0; } n = tipc_node_find(net, dnode); if (unlikely(!n)) { __skb_queue_purge(list); return -EHOSTUNREACH; } rcu_read_lock(); tipc_node_read_lock(n); node_up = node_is_up(n); peer_net = n->peer_net; tipc_node_read_unlock(n); if (node_up && peer_net && check_net(peer_net)) { /* xmit inner linux container */ tipc_lxc_xmit(peer_net, list); if (likely(skb_queue_empty(list))) { rcu_read_unlock(); tipc_node_put(n); return 0; } } rcu_read_unlock(); tipc_node_read_lock(n); bearer_id = n->active_links[selector & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); tipc_node_put(n); __skb_queue_purge(list); return -EHOSTUNREACH; } __skb_queue_head_init(&xmitq); le = &n->links[bearer_id]; spin_lock_bh(&le->lock); rc = tipc_link_xmit(le->link, list, &xmitq); spin_unlock_bh(&le->lock); tipc_node_read_unlock(n); if (unlikely(rc == -ENOBUFS)) tipc_node_link_down(n, bearer_id, false); else tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); return rc; } /* tipc_node_xmit_skb(): send single buffer to destination * Buffers sent via this function are generally TIPC_SYSTEM_IMPORTANCE * messages, which will not be rejected * The only exception is datagram messages rerouted after secondary * lookup, which are rare and safe to dispose of anyway. */ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, u32 selector) { struct sk_buff_head head; __skb_queue_head_init(&head); __skb_queue_tail(&head, skb); tipc_node_xmit(net, &head, dnode, selector); return 0; } /* tipc_node_distr_xmit(): send single buffer msgs to individual destinations * Note: this is only for SYSTEM_IMPORTANCE messages, which cannot be rejected */ int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *xmitq) { struct sk_buff *skb; u32 selector, dnode; while ((skb = __skb_dequeue(xmitq))) { selector = msg_origport(buf_msg(skb)); dnode = msg_destnode(buf_msg(skb)); tipc_node_xmit_skb(net, skb, dnode, selector); } return 0; } void tipc_node_broadcast(struct net *net, struct sk_buff *skb, int rc_dests) { struct sk_buff_head xmitq; struct sk_buff *txskb; struct tipc_node *n; u16 dummy; u32 dst; /* Use broadcast if all nodes support it */ if (!rc_dests && tipc_bcast_get_mode(net) != BCLINK_MODE_RCAST) { __skb_queue_head_init(&xmitq); __skb_queue_tail(&xmitq, skb); tipc_bcast_xmit(net, &xmitq, &dummy); return; } /* Otherwise use legacy replicast method */ rcu_read_lock(); list_for_each_entry_rcu(n, tipc_nodes(net), list) { dst = n->addr; if (in_own_node(net, dst)) continue; if (!node_is_up(n)) continue; txskb = pskb_copy(skb, GFP_ATOMIC); if (!txskb) break; msg_set_destnode(buf_msg(txskb), dst); tipc_node_xmit_skb(net, txskb, dst, 0); } rcu_read_unlock(); kfree_skb(skb); } static void tipc_node_mcast_rcv(struct tipc_node *n) { struct tipc_bclink_entry *be = &n->bc_entry; /* 'arrvq' is under inputq2's lock protection */ spin_lock_bh(&be->inputq2.lock); spin_lock_bh(&be->inputq1.lock); skb_queue_splice_tail_init(&be->inputq1, &be->arrvq); spin_unlock_bh(&be->inputq1.lock); spin_unlock_bh(&be->inputq2.lock); tipc_sk_mcast_rcv(n->net, &be->arrvq, &be->inputq2); } static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_link *ucl; int rc; rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr, xmitq); if (rc & TIPC_LINK_DOWN_EVT) { tipc_node_reset_links(n); return; } if (!(rc & TIPC_LINK_SND_STATE)) return; /* If probe message, a STATE response will be sent anyway */ if (msg_probe(hdr)) return; /* Produce a STATE message carrying broadcast NACK */ tipc_node_read_lock(n); ucl = n->links[bearer_id].link; if (ucl) tipc_link_build_state_msg(ucl, xmitq); tipc_node_read_unlock(n); } /** * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet * @bearer_id: id of bearer message arrived on * * Invoked with no locks held. */ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id) { int rc; struct sk_buff_head xmitq; struct tipc_bclink_entry *be; struct tipc_link_entry *le; struct tipc_msg *hdr = buf_msg(skb); int usr = msg_user(hdr); u32 dnode = msg_destnode(hdr); struct tipc_node *n; __skb_queue_head_init(&xmitq); /* If NACK for other node, let rcv link for that node peek into it */ if ((usr == BCAST_PROTOCOL) && (dnode != tipc_own_addr(net))) n = tipc_node_find(net, dnode); else n = tipc_node_find(net, msg_prevnode(hdr)); if (!n) { kfree_skb(skb); return; } be = &n->bc_entry; le = &n->links[bearer_id]; rc = tipc_bcast_rcv(net, be->link, skb); /* Broadcast ACKs are sent on a unicast link */ if (rc & TIPC_LINK_SND_STATE) { tipc_node_read_lock(n); tipc_link_build_state_msg(le->link, &xmitq); tipc_node_read_unlock(n); } if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); if (!skb_queue_empty(&be->inputq1)) tipc_node_mcast_rcv(n); /* Handle NAME_DISTRIBUTOR messages sent from 1.7 nodes */ if (!skb_queue_empty(&n->bc_entry.namedq)) tipc_named_rcv(net, &n->bc_entry.namedq, &n->bc_entry.named_rcv_nxt, &n->bc_entry.named_open); /* If reassembly or retransmission failure => reset all links to peer */ if (rc & TIPC_LINK_DOWN_EVT) tipc_node_reset_links(n); tipc_node_put(n); } /** * tipc_node_check_state - check and if necessary update node state * @n: target tipc_node * @skb: TIPC packet * @bearer_id: identity of bearer delivering the packet * @xmitq: queue for messages to be xmited on * Return: true if state and msg are ok, otherwise false */ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); int usr = msg_user(hdr); int mtyp = msg_type(hdr); u16 oseqno = msg_seqno(hdr); u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; int state = n->state; struct tipc_link *l, *tnl, *pl = NULL; struct tipc_media_addr *maddr; int pb_id; if (trace_tipc_node_check_state_enabled()) { trace_tipc_skb_dump(skb, false, "skb for node state check"); trace_tipc_node_check_state(n, true, " "); } l = n->links[bearer_id].link; if (!l) return false; rcv_nxt = tipc_link_rcv_nxt(l); if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) return true; /* Find parallel link, if any */ for (pb_id = 0; pb_id < MAX_BEARERS; pb_id++) { if ((pb_id != bearer_id) && n->links[pb_id].link) { pl = n->links[pb_id].link; break; } } if (!tipc_link_validate_msg(l, hdr)) { trace_tipc_skb_dump(skb, false, "PROTO invalid (2)!"); trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (2)!"); return false; } /* Check and update node accesibility if applicable */ if (state == SELF_UP_PEER_COMING) { if (!tipc_link_is_up(l)) return true; if (!msg_peer_link_is_up(hdr)) return true; tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT); } if (state == SELF_DOWN_PEER_LEAVING) { if (msg_peer_node_is_up(hdr)) return false; tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); return true; } if (state == SELF_LEAVING_PEER_DOWN) return false; /* Ignore duplicate packets */ if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt)) return true; /* Initiate or update failover mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) { syncpt = oseqno + exp_pkts - 1; if (pl && !tipc_link_is_reset(pl)) { __tipc_node_link_down(n, &pb_id, xmitq, &maddr); trace_tipc_node_link_down(n, true, "node link down <- failover!"); tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), tipc_link_inputq(l)); } /* If parallel link was already down, and this happened before * the tunnel link came up, node failover was never started. * Ensure that a FAILOVER_MSG is sent to get peer out of * NODE_FAILINGOVER state, also this node must accept * TUNNEL_MSGs from peer. */ if (n->state != NODE_FAILINGOVER) tipc_node_link_failover(n, pl, l, xmitq); /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; } /* Open parallel link when tunnel link reaches synch point */ if ((n->state == NODE_FAILINGOVER) && tipc_link_is_up(l)) { if (!more(rcv_nxt, n->sync_point)) return true; tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT); if (pl) tipc_link_fsm_evt(pl, LINK_FAILOVER_END_EVT); return true; } /* No syncing needed if only one link */ if (!pl || !tipc_link_is_up(pl)) return true; /* Initiate synch mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) { if (n->capabilities & TIPC_TUNNEL_ENHANCED) syncpt = msg_syncpt(hdr); else syncpt = msg_seqno(msg_inner_hdr(hdr)) + exp_pkts - 1; if (!tipc_link_is_up(l)) __tipc_node_link_up(n, bearer_id, xmitq); if (n->state == SELF_UP_PEER_UP) { n->sync_point = syncpt; tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT); } } /* Open tunnel link when parallel link reaches synch point */ if (n->state == NODE_SYNCHING) { if (tipc_link_is_synching(l)) { tnl = l; } else { tnl = pl; pl = l; } inputq_len = skb_queue_len(tipc_link_inputq(pl)); dlv_nxt = tipc_link_rcv_nxt(pl) - inputq_len; if (more(dlv_nxt, n->sync_point)) { tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); return true; } if (l == pl) return true; if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) return true; if (usr == LINK_PROTOCOL) return true; return false; } return true; } /** * tipc_rcv - process TIPC packets/messages arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet * @b: pointer to bearer message arrived on * * Invoked with no locks held. Bearer pointer must point to a valid bearer * structure (i.e. cannot be NULL), but bearer can be inactive. */ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct sk_buff_head xmitq; struct tipc_link_entry *le; struct tipc_msg *hdr; struct tipc_node *n; int bearer_id = b->identity; u32 self = tipc_own_addr(net); int usr, rc = 0; u16 bc_ack; #ifdef CONFIG_TIPC_CRYPTO struct tipc_ehdr *ehdr; /* Check if message must be decrypted first */ if (TIPC_SKB_CB(skb)->decrypted || !tipc_ehdr_validate(skb)) goto rcv; ehdr = (struct tipc_ehdr *)skb->data; if (likely(ehdr->user != LINK_CONFIG)) { n = tipc_node_find(net, ntohl(ehdr->addr)); if (unlikely(!n)) goto discard; } else { n = tipc_node_find_by_id(net, ehdr->id); } tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); if (!skb) return; rcv: #endif /* Ensure message is well-formed before touching the header */ if (unlikely(!tipc_msg_validate(&skb))) goto discard; __skb_queue_head_init(&xmitq); hdr = buf_msg(skb); usr = msg_user(hdr); bc_ack = msg_bcast_ack(hdr); /* Handle arrival of discovery or broadcast packet */ if (unlikely(msg_non_seq(hdr))) { if (unlikely(usr == LINK_CONFIG)) return tipc_disc_rcv(net, skb, b); else return tipc_node_bc_rcv(net, skb, bearer_id); } /* Discard unicast link messages destined for another node */ if (unlikely(!msg_short(hdr) && (msg_destnode(hdr) != self))) goto discard; /* Locate neighboring node that sent packet */ n = tipc_node_find(net, msg_prevnode(hdr)); if (unlikely(!n)) goto discard; le = &n->links[bearer_id]; /* Ensure broadcast reception is in synch with peer's send state */ if (unlikely(usr == LINK_PROTOCOL)) { if (unlikely(skb_linearize(skb))) { tipc_node_put(n); goto discard; } hdr = buf_msg(skb); tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq); } else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) { tipc_bcast_ack_rcv(net, n->bc_entry.link, hdr); } /* Receive packet directly if conditions permit */ tipc_node_read_lock(n); if (likely((n->state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) { spin_lock_bh(&le->lock); if (le->link) { rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } spin_unlock_bh(&le->lock); } tipc_node_read_unlock(n); /* Check/update node state before receiving */ if (unlikely(skb)) { if (unlikely(skb_linearize(skb))) goto out_node_put; tipc_node_write_lock(n); if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) { if (le->link) { rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } } tipc_node_write_unlock(n); } if (unlikely(rc & TIPC_LINK_UP_EVT)) tipc_node_link_up(n, bearer_id, &xmitq); if (unlikely(rc & TIPC_LINK_DOWN_EVT)) tipc_node_link_down(n, bearer_id, false); if (unlikely(!skb_queue_empty(&n->bc_entry.namedq))) tipc_named_rcv(net, &n->bc_entry.namedq, &n->bc_entry.named_rcv_nxt, &n->bc_entry.named_open); if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1))) tipc_node_mcast_rcv(n); if (!skb_queue_empty(&le->inputq)) tipc_sk_rcv(net, &le->inputq); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); out_node_put: tipc_node_put(n); discard: kfree_skb(skb); } void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, int prop) { struct tipc_net *tn = tipc_net(net); int bearer_id = b->identity; struct sk_buff_head xmitq; struct tipc_link_entry *e; struct tipc_node *n; __skb_queue_head_init(&xmitq); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_write_lock(n); e = &n->links[bearer_id]; if (e->link) { if (prop == TIPC_NLA_PROP_TOL) tipc_link_set_tolerance(e->link, b->tolerance, &xmitq); else if (prop == TIPC_NLA_PROP_MTU) tipc_link_set_mtu(e->link, b->mtu); /* Update MTU for node link entry */ e->mtu = tipc_link_mss(e->link); } tipc_node_write_unlock(n); tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } rcu_read_unlock(); } int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; struct tipc_node *peer, *temp_node; u8 node_id[NODE_ID_LEN]; u64 *w0 = (u64 *)&node_id[0]; u64 *w1 = (u64 *)&node_id[8]; u32 addr; int err; /* We identify the peer by its net */ if (!info->attrs[TIPC_NLA_NET]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); if (err) return err; /* attrs[TIPC_NLA_NET_NODEID] and attrs[TIPC_NLA_NET_ADDR] are * mutually exclusive cases */ if (attrs[TIPC_NLA_NET_ADDR]) { addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); if (!addr) return -EINVAL; } if (attrs[TIPC_NLA_NET_NODEID]) { if (!attrs[TIPC_NLA_NET_NODEID_W1]) return -EINVAL; *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); addr = hash128to32(node_id); } if (in_own_node(net, addr)) return -ENOTSUPP; spin_lock_bh(&tn->node_list_lock); peer = tipc_node_find(net, addr); if (!peer) { spin_unlock_bh(&tn->node_list_lock); return -ENXIO; } tipc_node_write_lock(peer); if (peer->state != SELF_DOWN_PEER_DOWN && peer->state != SELF_DOWN_PEER_LEAVING) { tipc_node_write_unlock(peer); err = -EBUSY; goto err_out; } tipc_node_clear_links(peer); tipc_node_write_unlock(peer); tipc_node_delete(peer); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); err = 0; err_out: tipc_node_put(peer); spin_unlock_bh(&tn->node_list_lock); return err; } int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); int done = cb->args[0]; int last_addr = cb->args[1]; struct tipc_node *node; struct tipc_nl_msg msg; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (last_addr) { node = tipc_node_find(net, last_addr); if (!node) { rcu_read_unlock(); /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the NLMSG_DONE message having * the NLM_F_DUMP_INTR flag set if the node state * changed while we released the lock. */ cb->prev_seq = 1; return -EPIPE; } tipc_node_put(node); } list_for_each_entry_rcu(node, &tn->node_list, list) { if (node->preliminary) continue; if (last_addr) { if (node->addr == last_addr) last_addr = 0; else continue; } tipc_node_read_lock(node); err = __tipc_nl_add_node(&msg, node); if (err) { last_addr = node->addr; tipc_node_read_unlock(node); goto out; } tipc_node_read_unlock(node); } done = 1; out: cb->args[0] = done; cb->args[1] = last_addr; rcu_read_unlock(); return skb->len; } /* tipc_node_find_by_name - locate owner node of link by link's name * @net: the applicable net namespace * @name: pointer to link name string * @bearer_id: pointer to index in 'node->links' array where the link was found. * * Returns pointer to node owning the link, or 0 if no matching link is found. */ static struct tipc_node *tipc_node_find_by_name(struct net *net, const char *link_name, unsigned int *bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l; struct tipc_node *n; struct tipc_node *found_node = NULL; int i; *bearer_id = 0; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_read_lock(n); for (i = 0; i < MAX_BEARERS; i++) { l = n->links[i].link; if (l && !strcmp(tipc_link_name(l), link_name)) { *bearer_id = i; found_node = n; break; } } tipc_node_read_unlock(n); if (found_node) break; } rcu_read_unlock(); return found_node; } int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) { int err; int res = 0; int bearer_id; char *name; struct tipc_link *link; struct tipc_node *node; struct sk_buff_head xmitq; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); __skb_queue_head_init(&xmitq); if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_LINK_NAME]); if (strcmp(name, tipc_bclink_name) == 0) return tipc_nl_bc_link_set(net, attrs); node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) return -EINVAL; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { res = -EINVAL; goto out; } if (attrs[TIPC_NLA_LINK_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); if (err) { res = err; goto out; } if (props[TIPC_NLA_PROP_TOL]) { u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); tipc_link_set_tolerance(link, tol, &xmitq); } if (props[TIPC_NLA_PROP_PRIO]) { u32 prio; prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { u32 max_win; max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); tipc_link_set_queue_limits(link, tipc_link_min_win(link), max_win); } } out: tipc_node_read_unlock(node); tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr, NULL); return res; } int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct tipc_nl_msg msg; char *name; int err; msg.portid = info->snd_portid; msg.seq = info->snd_seq; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_LINK_NAME]); msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; if (strcmp(name, tipc_bclink_name) == 0) { err = tipc_nl_add_bc_link(net, &msg, tipc_net(net)->bcl); if (err) goto err_free; } else { int bearer_id; struct tipc_node *node; struct tipc_link *link; node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) { err = -EINVAL; goto err_free; } tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { tipc_node_read_unlock(node); err = -EINVAL; goto err_free; } err = __tipc_nl_add_link(net, &msg, link, 0); tipc_node_read_unlock(node); if (err) goto err_free; } return genlmsg_reply(msg.skb, info); err_free: nlmsg_free(msg.skb); return err; } int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) { int err; char *link_name; unsigned int bearer_id; struct tipc_link *link; struct tipc_node *node; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_link_entry *le; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]); err = -EINVAL; if (!strcmp(link_name, tipc_bclink_name)) { err = tipc_bclink_reset_stats(net, tipc_bc_sndlink(net)); if (err) return err; return 0; } else if (strstr(link_name, tipc_bclink_name)) { rcu_read_lock(); list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); link = node->bc_entry.link; if (link && !strcmp(link_name, tipc_link_name(link))) { err = tipc_bclink_reset_stats(net, link); tipc_node_read_unlock(node); break; } tipc_node_read_unlock(node); } rcu_read_unlock(); return err; } node = tipc_node_find_by_name(net, link_name, &bearer_id); if (!node) return -EINVAL; le = &node->links[bearer_id]; tipc_node_read_lock(node); spin_lock_bh(&le->lock); link = node->links[bearer_id].link; if (!link) { spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return -EINVAL; } tipc_link_reset_stats(link); spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return 0; } /* Caller should hold node lock */ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, struct tipc_node *node, u32 *prev_link, bool bc_link) { u32 i; int err; for (i = *prev_link; i < MAX_BEARERS; i++) { *prev_link = i; if (!node->links[i].link) continue; err = __tipc_nl_add_link(net, msg, node->links[i].link, NLM_F_MULTI); if (err) return err; } if (bc_link) { *prev_link = i; err = tipc_nl_add_bc_link(net, msg, node->bc_entry.link); if (err) return err; } *prev_link = 0; return 0; } int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; struct tipc_nl_msg msg; u32 prev_node = cb->args[0]; u32 prev_link = cb->args[1]; int done = cb->args[2]; bool bc_link = cb->args[3]; int err; if (done) return 0; if (!prev_node) { /* Check if broadcast-receiver links dumping is needed */ if (attrs && attrs[TIPC_NLA_LINK]) { err = nla_parse_nested_deprecated(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], tipc_nl_link_policy, NULL); if (unlikely(err)) return err; if (unlikely(!link[TIPC_NLA_LINK_BROADCAST])) return -EINVAL; bc_link = true; } } msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (prev_node) { node = tipc_node_find(net, prev_node); if (!node) { /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the last NLMSG_DONE message * having the NLM_F_DUMP_INTR flag set. */ cb->prev_seq = 1; goto out; } tipc_node_put(node); list_for_each_entry_continue_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } else { err = tipc_nl_add_bc_link(net, &msg, tn->bcl); if (err) goto out; list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } done = 1; out: rcu_read_unlock(); cb->args[0] = prev_node; cb->args[1] = prev_link; cb->args[2] = done; cb->args[3] = bc_link; return skb->len; } int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_MON_MAX + 1]; struct net *net = sock_net(skb->sk); int err; if (!info->attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_MON_MAX, info->attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, info->extack); if (err) return err; if (attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]) { u32 val; val = nla_get_u32(attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]); err = tipc_nl_monitor_set_threshold(net, val); if (err) return err; } return 0; } static int __tipc_nl_add_monitor_prop(struct net *net, struct tipc_nl_msg *msg) { struct nlattr *attrs; void *hdr; u32 val; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, 0, TIPC_NL_MON_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON); if (!attrs) goto msg_full; val = tipc_nl_monitor_get_threshold(net); if (nla_put_u32(msg->skb, TIPC_NLA_MON_ACTIVATION_THRESHOLD, val)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_nl_msg msg; int err; msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; msg.portid = info->snd_portid; msg.seq = info->snd_seq; err = __tipc_nl_add_monitor_prop(net, &msg); if (err) { nlmsg_free(msg.skb); return err; } return genlmsg_reply(msg.skb, info); } int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_bearer = cb->args[0]; struct tipc_nl_msg msg; int bearer_id; int err; if (prev_bearer == MAX_BEARERS) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); for (bearer_id = prev_bearer; bearer_id < MAX_BEARERS; bearer_id++) { err = __tipc_nl_add_monitor(net, &msg, bearer_id); if (err) break; } rtnl_unlock(); cb->args[0] = bearer_id; return skb->len; } int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_node = cb->args[1]; u32 bearer_id = cb->args[2]; int done = cb->args[0]; struct tipc_nl_msg msg; int err; if (!prev_node) { struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; if (!attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(mon, TIPC_NLA_MON_MAX, attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, NULL); if (err) return err; if (!mon[TIPC_NLA_MON_REF]) return -EINVAL; bearer_id = nla_get_u32(mon[TIPC_NLA_MON_REF]); if (bearer_id >= MAX_BEARERS) return -EINVAL; } if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); err = tipc_nl_add_monitor_peer(net, &msg, bearer_id, &prev_node); if (!err) done = 1; rtnl_unlock(); cb->args[0] = done; cb->args[1] = prev_node; cb->args[2] = bearer_id; return skb->len; } #ifdef CONFIG_TIPC_CRYPTO static int tipc_nl_retrieve_key(struct nlattr **attrs, struct tipc_aead_key **pkey) { struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY]; struct tipc_aead_key *key; if (!attr) return -ENODATA; if (nla_len(attr) < sizeof(*key)) return -EINVAL; key = (struct tipc_aead_key *)nla_data(attr); if (key->keylen > TIPC_AEAD_KEYLEN_MAX || nla_len(attr) < tipc_aead_key_size(key)) return -EINVAL; *pkey = key; return 0; } static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) { struct nlattr *attr = attrs[TIPC_NLA_NODE_ID]; if (!attr) return -ENODATA; if (nla_len(attr) < TIPC_NODEID_LEN) return -EINVAL; *node_id = (u8 *)nla_data(attr); return 0; } static int tipc_nl_retrieve_rekeying(struct nlattr **attrs, u32 *intv) { struct nlattr *attr = attrs[TIPC_NLA_NODE_REKEYING]; if (!attr) return -ENODATA; *intv = nla_get_u32(attr); return 0; } static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx; struct tipc_node *n = NULL; struct tipc_aead_key *ukey; bool rekeying = true, master_key = false; u8 *id, *own_id, mode; u32 intv = 0; int rc = 0; if (!info->attrs[TIPC_NLA_NODE]) return -EINVAL; rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX, info->attrs[TIPC_NLA_NODE], tipc_nl_node_policy, info->extack); if (rc) return rc; own_id = tipc_own_id(net); if (!own_id) { GENL_SET_ERR_MSG(info, "not found own node identity (set id?)"); return -EPERM; } rc = tipc_nl_retrieve_rekeying(attrs, &intv); if (rc == -ENODATA) rekeying = false; rc = tipc_nl_retrieve_key(attrs, &ukey); if (rc == -ENODATA && rekeying) goto rekeying; else if (rc) return rc; rc = tipc_aead_key_validate(ukey, info); if (rc) return rc; rc = tipc_nl_retrieve_nodeid(attrs, &id); switch (rc) { case -ENODATA: mode = CLUSTER_KEY; master_key = !!(attrs[TIPC_NLA_NODE_KEY_MASTER]); break; case 0: mode = PER_NODE_KEY; if (memcmp(id, own_id, NODE_ID_LEN)) { n = tipc_node_find_by_id(net, id) ?: tipc_node_create(net, 0, id, 0xffffu, 0, true); if (unlikely(!n)) return -ENOMEM; c = n->crypto_rx; } break; default: return rc; } /* Initiate the TX/RX key */ rc = tipc_crypto_key_init(c, ukey, mode, master_key); if (n) tipc_node_put(n); if (unlikely(rc < 0)) { GENL_SET_ERR_MSG(info, "unable to initiate or attach new key"); return rc; } else if (c == tx) { /* Distribute TX key but not master one */ if (!master_key && tipc_crypto_key_distr(tx, rc, NULL)) GENL_SET_ERR_MSG(info, "failed to replicate new key"); rekeying: /* Schedule TX rekeying if needed */ tipc_crypto_rekeying_sched(tx, rekeying, intv); } return 0; } int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_set_key(skb, info); rtnl_unlock(); return err; } static int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_node *n; tipc_crypto_key_flush(tn->crypto_tx); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) tipc_crypto_key_flush(n->crypto_rx); rcu_read_unlock(); return 0; } int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_flush_key(skb, info); rtnl_unlock(); return err; } #endif /** * tipc_node_dump - dump TIPC node data * @n: tipc node to be dumped * @more: dump more? * - false: dump only tipc node data * - true: dump node link data as well * @buf: returned buffer of dump data in format */ int tipc_node_dump(struct tipc_node *n, bool more, char *buf) { int i = 0; size_t sz = (more) ? NODE_LMAX : NODE_LMIN; if (!n) { i += scnprintf(buf, sz, "node data: (null)\n"); return i; } i += scnprintf(buf, sz, "node data: %x", n->addr); i += scnprintf(buf + i, sz - i, " %x", n->state); i += scnprintf(buf + i, sz - i, " %d", n->active_links[0]); i += scnprintf(buf + i, sz - i, " %d", n->active_links[1]); i += scnprintf(buf + i, sz - i, " %x", n->action_flags); i += scnprintf(buf + i, sz - i, " %u", n->failover_sent); i += scnprintf(buf + i, sz - i, " %u", n->sync_point); i += scnprintf(buf + i, sz - i, " %d", n->link_cnt); i += scnprintf(buf + i, sz - i, " %u", n->working_links); i += scnprintf(buf + i, sz - i, " %x", n->capabilities); i += scnprintf(buf + i, sz - i, " %lu\n", n->keepalive_intv); if (!more) return i; i += scnprintf(buf + i, sz - i, "link_entry[0]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[0].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[0].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[0].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[0].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "link_entry[1]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[1].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[1].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[1].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[1].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "bclink:\n "); i += tipc_link_dump(n->bc_entry.link, TIPC_DUMP_NONE, buf + i); return i; } void tipc_node_pre_cleanup_net(struct net *exit_net) { struct tipc_node *n; struct tipc_net *tn; struct net *tmp; rcu_read_lock(); for_each_net_rcu(tmp) { if (tmp == exit_net) continue; tn = tipc_net(tmp); if (!tn) continue; spin_lock_bh(&tn->node_list_lock); list_for_each_entry_rcu(n, &tn->node_list, list) { if (!n->peer_net) continue; if (n->peer_net != exit_net) continue; tipc_node_write_lock(n); n->peer_net = NULL; n->peer_hash_mix = 0; tipc_node_write_unlock_fast(n); break; } spin_unlock_bh(&tn->node_list_lock); } rcu_read_unlock(); } |
6 6 281 6 4 1 3 1 2 1 2 6 6 6 4 3 1 1 4 2 2 2 3 3 5 5 5 5 3 4 4 1 4 1 4 3 6 3 6 6 1 1 1 6 4 2 4 4 4 281 239 9 9 1629 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 | // SPDX-License-Identifier: GPL-2.0 /* Generic nexthop implementation * * Copyright (c) 2017-19 Cumulus Networks * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> */ #include <linux/nexthop.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/arp.h> #include <net/ipv6_stubs.h> #include <net/lwtunnel.h> #include <net/ndisc.h> #include <net/nexthop.h> #include <net/route.h> #include <net/sock.h> #define NH_RES_DEFAULT_IDLE_TIMER (120 * HZ) #define NH_RES_DEFAULT_UNBALANCED_TIMER 0 /* No forced rebalancing. */ static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo); #define NH_DEV_HASHBITS 8 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, [NHA_GROUP_TYPE] = { .type = NLA_U16 }, [NHA_BLACKHOLE] = { .type = NLA_FLAG }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_GATEWAY] = { .type = NLA_BINARY }, [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_FDB] = { .type = NLA_FLAG }, [NHA_RES_GROUP] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_policy_get[] = { [NHA_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump[] = { [NHA_OIF] = { .type = NLA_U32 }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_FDB] = { .type = NLA_FLAG }, }; static const struct nla_policy rtm_nh_res_policy_new[] = { [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 }, [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 }, [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = { [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_get_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_get[] = { [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 }, }; static bool nexthop_notifiers_is_empty(struct net *net) { return !net->nexthop.notifier_chain.head; } static void __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, const struct nh_info *nhi) { nh_info->dev = nhi->fib_nhc.nhc_dev; nh_info->gw_family = nhi->fib_nhc.nhc_gw_family; if (nh_info->gw_family == AF_INET) nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4; else if (nh_info->gw_family == AF_INET6) nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6; nh_info->is_reject = nhi->reject_nh; nh_info->is_fdb = nhi->fdb_nh; nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate; } static int nh_notifier_single_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); if (!info->nh) return -ENOMEM; __nh_notifier_single_info_init(info->nh, nhi); return 0; } static void nh_notifier_single_info_fini(struct nh_notifier_info *info) { kfree(info->nh); } static int nh_notifier_mpath_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { u16 num_nh = nhg->num_nh; int i; info->type = NH_NOTIFIER_INFO_TYPE_GRP; info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh), GFP_KERNEL); if (!info->nh_grp) return -ENOMEM; info->nh_grp->num_nh = num_nh; info->nh_grp->is_fdb = nhg->fdb_nh; for (i = 0; i < num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; nhi = rtnl_dereference(nhge->nh->nh_info); info->nh_grp->nh_entries[i].id = nhge->nh->id; info->nh_grp->nh_entries[i].weight = nhge->weight; __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, nhi); } return 0; } static int nh_notifier_res_table_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); u16 num_nh_buckets = res_table->num_nh_buckets; unsigned long size; u16 i; info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE; size = struct_size(info->nh_res_table, nhs, num_nh_buckets); info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!info->nh_res_table) return -ENOMEM; info->nh_res_table->num_nh_buckets = num_nh_buckets; for (i = 0; i < num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; struct nh_info *nhi; nhge = rtnl_dereference(bucket->nh_entry); nhi = rtnl_dereference(nhge->nh->nh_info); __nh_notifier_single_info_init(&info->nh_res_table->nhs[i], nhi); } return 0; } static int nh_notifier_grp_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) return nh_notifier_mpath_info_init(info, nhg); else if (nhg->resilient) return nh_notifier_res_table_info_init(info, nhg); return -EINVAL; } static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) kfree(info->nh_grp); else if (nhg->resilient) vfree(info->nh_res_table); } static int nh_notifier_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { info->id = nh->id; if (nh->is_group) return nh_notifier_grp_info_init(info, nh); else return nh_notifier_single_info_init(info, nh); } static void nh_notifier_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { if (nh->is_group) nh_notifier_grp_info_fini(info, nh); else nh_notifier_single_info_fini(info); } static int call_nexthop_notifiers(struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_info_init(&info, nh); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static int nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info, bool force, unsigned int *p_idle_timer_ms) { struct nh_res_table *res_table; struct nh_group *nhg; struct nexthop *nh; int err = 0; /* When 'force' is false, nexthop bucket replacement is performed * because the bucket was deemed to be idle. In this case, capable * listeners can choose to perform an atomic replacement: The bucket is * only replaced if it is inactive. However, if the idle timer interval * is smaller than the interval in which a listener is querying * buckets' activity from the device, then atomic replacement should * not be tried. Pass the idle timer value to listeners, so that they * could determine which type of replacement to perform. */ if (force) { *p_idle_timer_ms = 0; return 0; } rcu_read_lock(); nh = nexthop_find_by_id(info->net, info->id); if (!nh) { err = -EINVAL; goto out; } nhg = rcu_dereference(nh->nh_grp); res_table = rcu_dereference(nhg->res_table); *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer); out: rcu_read_unlock(); return err; } static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi) { unsigned int idle_timer_ms; int err; err = nh_notifier_res_bucket_idle_timer_get(info, force, &idle_timer_ms); if (err) return err; info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET; info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket), GFP_KERNEL); if (!info->nh_res_bucket) return -ENOMEM; info->nh_res_bucket->bucket_index = bucket_index; info->nh_res_bucket->idle_timer_ms = idle_timer_ms; info->nh_res_bucket->force = force; __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi); __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi); return 0; } static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info) { kfree(info->nh_res_bucket); } static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nhg_id, }; int err; if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_res_bucket_info_init(&info, bucket_index, force, oldi, newi); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_BUCKET_REPLACE, &info); nh_notifier_res_bucket_info_fini(&info); return notifier_to_errno(err); } /* There are three users of RES_TABLE, and NHs etc. referenced from there: * * 1) a collection of callbacks for NH maintenance. This operates under * RTNL, * 2) the delayed work that gradually balances the resilient table, * 3) and nexthop_select_path(), operating under RCU. * * Both the delayed work and the RTNL block are writers, and need to * maintain mutual exclusion. Since there are only two and well-known * writers for each table, the RTNL code can make sure it has exclusive * access thus: * * - Have the DW operate without locking; * - synchronously cancel the DW; * - do the writing; * - if the write was not actually a delete, call upkeep, which schedules * DW again if necessary. * * The functions that are always called from the RTNL context use * rtnl_dereference(). The functions that can also be called from the DW do * a raw dereference and rely on the above mutual exclusion scheme. */ #define nh_res_dereference(p) (rcu_dereference_raw(p)) static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nexthop *old_nh, struct nexthop *new_nh, struct netlink_ext_ack *extack) { struct nh_info *oldi = nh_res_dereference(old_nh->nh_info); struct nh_info *newi = nh_res_dereference(new_nh->nh_info); return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index, force, oldi, newi, extack); } static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; struct nh_group *nhg; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; /* At this point, the nexthop buckets are still not populated. Only * emit a notification with the logical nexthops, so that a listener * could potentially veto it in case of unsupported configuration. */ nhg = rtnl_dereference(nh->nh_grp); err = nh_notifier_mpath_info_init(&info, nhg); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, &info); kfree(info.nh_grp); return notifier_to_errno(err); } static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; err = nh_notifier_info_init(&info, nh); if (err) return err; err = nb->notifier_call(nb, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static unsigned int nh_dev_hashfn(unsigned int val) { unsigned int mask = NH_DEV_HASHSIZE - 1; return (val ^ (val >> NH_DEV_HASHBITS) ^ (val >> (NH_DEV_HASHBITS * 2))) & mask; } static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) { struct net_device *dev = nhi->fib_nhc.nhc_dev; struct hlist_head *head; unsigned int hash; WARN_ON(!dev); hash = nh_dev_hashfn(dev->ifindex); head = &net->nexthop.devhash[hash]; hlist_add_head(&nhi->dev_hash, head); } static void nexthop_free_group(struct nexthop *nh) { struct nh_group *nhg; int i; nhg = rcu_dereference_raw(nh->nh_grp); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; WARN_ON(!list_empty(&nhge->nh_list)); nexthop_put(nhge->nh); } WARN_ON(nhg->spare == nhg); if (nhg->resilient) vfree(rcu_dereference_raw(nhg->res_table)); kfree(nhg->spare); kfree(nhg); } static void nexthop_free_single(struct nexthop *nh) { struct nh_info *nhi; nhi = rcu_dereference_raw(nh->nh_info); switch (nhi->family) { case AF_INET: fib_nh_release(nh->net, &nhi->fib_nh); break; case AF_INET6: ipv6_stub->fib6_nh_release(&nhi->fib6_nh); break; } kfree(nhi); } void nexthop_free_rcu(struct rcu_head *head) { struct nexthop *nh = container_of(head, struct nexthop, rcu); if (nh->is_group) nexthop_free_group(nh); else nexthop_free_single(nh); kfree(nh); } EXPORT_SYMBOL_GPL(nexthop_free_rcu); static struct nexthop *nexthop_alloc(void) { struct nexthop *nh; nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); if (nh) { INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); INIT_LIST_HEAD(&nh->fdb_list); } return nh; } static struct nh_group *nexthop_grp_alloc(u16 num_nh) { struct nh_group *nhg; nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL); if (nhg) nhg->num_nh = num_nh; return nhg; } static void nh_res_table_upkeep_dw(struct work_struct *work); static struct nh_res_table * nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg) { const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets; struct nh_res_table *res_table; unsigned long size; size = struct_size(res_table, nh_buckets, num_nh_buckets); res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!res_table) return NULL; res_table->net = net; res_table->nhg_id = nhg_id; INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw); INIT_LIST_HEAD(&res_table->uw_nh_entries); res_table->idle_timer = cfg->nh_grp_res_idle_timer; res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; res_table->num_nh_buckets = num_nh_buckets; return res_table; } static void nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) ; } /* no reference taken; rcu lock or rtnl must be held */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id) { struct rb_node **pp, *parent = NULL, *next; pp = &net->nexthop.rb_root.rb_node; while (1) { struct nexthop *nh; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (id < nh->id) pp = &next->rb_left; else if (id > nh->id) pp = &next->rb_right; else return nh; } return NULL; } EXPORT_SYMBOL_GPL(nexthop_find_by_id); /* used for auto id allocation; called with rtnl held */ static u32 nh_find_unused_id(struct net *net) { u32 id_start = net->nexthop.last_id_allocated; while (1) { net->nexthop.last_id_allocated++; if (net->nexthop.last_id_allocated == id_start) break; if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated)) return net->nexthop.last_id_allocated; } return 0; } static void nh_res_time_set_deadline(unsigned long next_time, unsigned long *deadline) { if (time_before(next_time, *deadline)) *deadline = next_time; } static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table) { if (list_empty(&res_table->uw_nh_entries)) return 0; return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since); } static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); struct nlattr *nest; nest = nla_nest_start(skb, NHA_RES_GROUP); if (!nest) return -EMSGSIZE; if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS, res_table->num_nh_buckets) || nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER, jiffies_to_clock_t(res_table->idle_timer)) || nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER, jiffies_to_clock_t(res_table->unbalanced_timer)) || nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME, nh_res_table_unbalanced_time(res_table), NHA_RES_GROUP_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) { struct nexthop_grp *p; size_t len = nhg->num_nh * sizeof(*p); struct nlattr *nla; u16 group_type = 0; int i; if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; else if (nhg->resilient) group_type = NEXTHOP_GRP_TYPE_RES; if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) goto nla_put_failure; nla = nla_reserve(skb, NHA_GROUP, len); if (!nla) goto nla_put_failure; p = nla_data(nla); for (i = 0; i < nhg->num_nh; ++i) { p->id = nhg->nh_entries[i].nh->id; p->weight = nhg->nh_entries[i].weight - 1; p += 1; } if (nhg->resilient && nla_put_nh_group_res(skb, nhg)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, int event, u32 portid, u32 seq, unsigned int nlflags) { struct fib6_nh *fib6_nh; struct fib_nh *fib_nh; struct nlmsghdr *nlh; struct nh_info *nhi; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = nh->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; if (nla_put_nh_group(skb, nhg)) goto nla_put_failure; goto out; } nhi = rtnl_dereference(nh->nh_info); nhm->nh_family = nhi->family; if (nhi->reject_nh) { if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; } else if (nhi->fdb_nh) { if (nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; } else { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex)) goto nla_put_failure; } nhm->nh_scope = nhi->fib_nhc.nhc_scope; switch (nhi->family) { case AF_INET: fib_nh = &nhi->fib_nh; if (fib_nh->fib_nh_gw_family && nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) goto nla_put_failure; break; case AF_INET6: fib6_nh = &nhi->fib6_nh; if (fib6_nh->fib_nh_gw_family && nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6)) goto nla_put_failure; break; } if (nhi->fib_nhc.nhc_lwtstate && lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, NHA_ENCAP, NHA_ENCAP_TYPE) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg) { return nla_total_size(0) + /* NHA_RES_GROUP */ nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */ nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */ nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */ nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */ } static size_t nh_nlmsg_size_grp(struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; size_t tot = nla_total_size(sz) + nla_total_size(2); /* NHA_GROUP_TYPE */ if (nhg->resilient) tot += nh_nlmsg_size_grp_res(nhg); return tot; } static size_t nh_nlmsg_size_single(struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); size_t sz; /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE * are mutually exclusive */ sz = nla_total_size(4); /* NHA_OIF */ switch (nhi->family) { case AF_INET: if (nhi->fib_nh.fib_nh_gw_family) sz += nla_total_size(4); /* NHA_GATEWAY */ break; case AF_INET6: /* NHA_GATEWAY */ if (nhi->fib6_nh.fib_nh_gw_family) sz += nla_total_size(sizeof(const struct in6_addr)); break; } if (nhi->fib_nhc.nhc_lwtstate) { sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate); sz += nla_total_size(2); /* NHA_ENCAP_TYPE */ } return sz; } static size_t nh_nlmsg_size(struct nexthop *nh) { size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) sz += nh_nlmsg_size_grp(nh); else sz += nh_nlmsg_size_single(nh); return sz; } static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) { unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); if (!skb) goto errout; err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags); if (err < 0) { /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP, info->nlh, gfp_any()); return; errout: if (err < 0) rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) { return (unsigned long)atomic_long_read(&bucket->used_time); } static unsigned long nh_res_bucket_idle_point(const struct nh_res_table *res_table, const struct nh_res_bucket *bucket, unsigned long now) { unsigned long time = nh_res_bucket_used_time(bucket); /* Bucket was not used since it was migrated. The idle time is now. */ if (time == bucket->migrated_time) return now; return time + res_table->idle_timer; } static unsigned long nh_res_table_unb_point(const struct nh_res_table *res_table) { return res_table->unbalanced_since + res_table->unbalanced_timer; } static void nh_res_bucket_set_idle(const struct nh_res_table *res_table, struct nh_res_bucket *bucket) { unsigned long now = jiffies; atomic_long_set(&bucket->used_time, (long)now); bucket->migrated_time = now; } static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket) { atomic_long_set(&bucket->used_time, (long)jiffies); } static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket) { unsigned long used_time = nh_res_bucket_used_time(bucket); return jiffies_delta_to_clock_t(jiffies - used_time); } static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh, struct nh_res_bucket *bucket, u16 bucket_index, int event, u32 portid, u32 seq, unsigned int nlflags, struct netlink_ext_ack *extack) { struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nlmsghdr *nlh; struct nlattr *nest; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = bucket->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; nest = nla_nest_start(skb, NHA_RES_BUCKET); if (!nest) goto nla_put_failure; if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) || nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) || nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME, nh_res_bucket_idle_time(bucket), NHA_RES_BUCKET_PAD)) goto nla_put_failure_nest; nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; nla_put_failure_nest: nla_nest_cancel(skb, nest); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void nexthop_bucket_notify(struct nh_res_table *res_table, u16 bucket_index) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nexthop *nh = nhge->nh_parent; struct sk_buff *skb; int err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto errout; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE, NULL); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL); return; errout: if (err < 0) rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); } static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, bool *is_fdb, struct netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); /* Nesting groups within groups is not supported. */ if (nhg->hash_threshold) { NL_SET_ERR_MSG(extack, "Hash-threshold group can not be a nexthop within a group"); return false; } if (nhg->resilient) { NL_SET_ERR_MSG(extack, "Resilient group can not be a nexthop within a group"); return false; } *is_fdb = nhg->fdb_nh; } else { struct nh_info *nhi = rtnl_dereference(nh->nh_info); if (nhi->reject_nh && npaths > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be used in a group with more than 1 path"); return false; } *is_fdb = nhi->fdb_nh; } return true; } static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, struct netlink_ext_ack *extack) { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (!nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); return -EINVAL; } if (*nh_family == AF_UNSPEC) { *nh_family = nhi->family; } else if (*nh_family != nhi->family) { NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); return -EINVAL; } return 0; } static int nh_check_attr_group(struct net *net, struct nlattr *tb[], size_t tb_size, u16 nh_grp_type, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int i, j; u8 nhg_fdb = 0; if (!len || len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, "Invalid length for nexthop group attribute"); return -EINVAL; } /* convert len to number of nexthop ids */ len /= sizeof(*nhg); nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { if (nhg[i].resvd1 || nhg[i].resvd2) { NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0"); return -EINVAL; } if (nhg[i].weight > 254) { NL_SET_ERR_MSG(extack, "Invalid value for weight"); return -EINVAL; } for (j = i + 1; j < len; ++j) { if (nhg[i].id == nhg[j].id) { NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); return -EINVAL; } } } if (tb[NHA_FDB]) nhg_fdb = 1; nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { struct nexthop *nh; bool is_fdb_nh; nh = nexthop_find_by_id(net, nhg[i].id); if (!nh) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (!valid_group_nh(nh, len, &is_fdb_nh, extack)) return -EINVAL; if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) return -EINVAL; if (!nhg_fdb && is_fdb_nh) { NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); return -EINVAL; } } for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; switch (i) { case NHA_FDB: continue; case NHA_RES_GROUP: if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) continue; break; } NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; } return 0; } static bool ipv6_good_nh(const struct fib6_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool ipv4_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, (__force u32)nh->fib_nh_gw4); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool nexthop_is_good_nh(const struct nexthop *nh) { struct nh_info *nhi = rcu_dereference(nh->nh_info); switch (nhi->family) { case AF_INET: return ipv4_good_nh(&nhi->fib_nh); case AF_INET6: return ipv6_good_nh(&nhi->fib6_nh); } return false; } static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) { int i; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; return nhge->nh; } WARN_ON_ONCE(1); return NULL; } static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { struct nexthop *rc = NULL; int i; if (nhg->fdb_nh) return nexthop_select_path_fdb(nhg, hash); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ if (!nexthop_is_good_nh(nhge->nh)) continue; if (!rc) rc = nhge->nh; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; return nhge->nh; } return rc ? : nhg->nh_entries[0].nh; } static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) { struct nh_res_table *res_table = rcu_dereference(nhg->res_table); u16 bucket_index = hash % res_table->num_nh_buckets; struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; /* nexthop_select_path() is expected to return a non-NULL value, so * skip protocol validation and just hand out whatever there is. */ bucket = &res_table->nh_buckets[bucket_index]; nh_res_bucket_set_busy(bucket); nhge = rcu_dereference(bucket->nh_entry); return nhge->nh; } struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) { struct nh_group *nhg; if (!nh->is_group) return nh; nhg = rcu_dereference(nh->nh_grp); if (nhg->hash_threshold) return nexthop_select_path_hthr(nhg, hash); else if (nhg->resilient) return nexthop_select_path_res(nhg, hash); /* Unreachable. */ return NULL; } EXPORT_SYMBOL_GPL(nexthop_select_path); int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg) { struct nh_info *nhi; int err; if (nh->is_group) { struct nh_group *nhg; int i; nhg = rcu_dereference_rtnl(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhi = rcu_dereference_rtnl(nhge->nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } } else { nhi = rcu_dereference_rtnl(nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh); static int check_src_addr(const struct in6_addr *saddr, struct netlink_ext_ack *extack) { if (!ipv6_addr_any(saddr)) { NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects"); return -EINVAL; } return 0; } int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; bool is_fdb_nh; /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source * routing it can not use nexthop objects. mlxsw also does not allow * fib6_src on routes. */ if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) return -EINVAL; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->has_v4) goto no_v4_nh; is_fdb_nh = nhg->fdb_nh; } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->family == AF_INET) goto no_v4_nh; is_fdb_nh = nhi->fdb_nh; } if (is_fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); return -EINVAL; } return 0; no_v4_nh: NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); return -EINVAL; } EXPORT_SYMBOL_GPL(fib6_check_nexthop); /* if existing nexthop has ipv6 routes linked to it, need * to verify this new spec works with ipv6 */ static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib6_info *f6i; if (list_empty(&old->f6i_list)) return 0; list_for_each_entry(f6i, &old->f6i_list, nh_list) { if (check_src_addr(&f6i->fib6_src.addr, extack) < 0) return -EINVAL; } return fib6_check_nexthop(new, NULL, extack); } static int nexthop_check_scope(struct nh_info *nhi, u8 scope, struct netlink_ext_ack *extack) { if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); return -EINVAL; } if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) { NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); return -EINVAL; } return 0; } /* Invoked by fib add code to verify nexthop by id is ok with * config for prefix; parts of fib_check_nh not done when nexthop * object is used. */ int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack) { struct nh_info *nhi; int err = 0; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } if (scope == RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); err = -EINVAL; goto out; } /* all nexthops in a group have the same scope */ nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info); err = nexthop_check_scope(nhi, scope, extack); } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } err = nexthop_check_scope(nhi, scope, extack); } out: return err; } static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib_info *fi; list_for_each_entry(fi, &old->fi_list, nh_list) { int err; err = fib_check_nexthop(new, fi->fib_scope, extack); if (err) return err; } return 0; } static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets == nhge->res.wants_buckets; } static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets > nhge->res.wants_buckets; } static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets < nhge->res.wants_buckets; } static bool nh_res_table_is_balanced(const struct nh_res_table *res_table) { return list_empty(&res_table->uw_nh_entries); } static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket) { struct nh_grp_entry *nhge; if (bucket->occupied) { nhge = nh_res_dereference(bucket->nh_entry); nhge->res.count_buckets--; bucket->occupied = false; } } static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket, struct nh_grp_entry *nhge) { nh_res_bucket_unset_nh(bucket); bucket->occupied = true; rcu_assign_pointer(bucket->nh_entry, nhge); nhge->res.count_buckets++; } static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table, struct nh_res_bucket *bucket, unsigned long *deadline, bool *force) { unsigned long now = jiffies; struct nh_grp_entry *nhge; unsigned long idle_point; if (!bucket->occupied) { /* The bucket is not occupied, its NHGE pointer is either * NULL or obsolete. We _have to_ migrate: set force. */ *force = true; return true; } nhge = nh_res_dereference(bucket->nh_entry); /* If the bucket is populated by an underweight or balanced * nexthop, do not migrate. */ if (!nh_res_nhge_is_ow(nhge)) return false; /* At this point we know that the bucket is populated with an * overweight nexthop. It needs to be migrated to a new nexthop if * the idle timer of unbalanced timer expired. */ idle_point = nh_res_bucket_idle_point(res_table, bucket, now); if (time_after_eq(now, idle_point)) { /* The bucket is idle. We _can_ migrate: unset force. */ *force = false; return true; } /* Unbalanced timer of 0 means "never force". */ if (res_table->unbalanced_timer) { unsigned long unb_point; unb_point = nh_res_table_unb_point(res_table); if (time_after(now, unb_point)) { /* The bucket is not idle, but the unbalanced timer * expired. We _can_ migrate, but set force anyway, * so that drivers know to ignore activity reports * from the HW. */ *force = true; return true; } nh_res_time_set_deadline(unb_point, deadline); } nh_res_time_set_deadline(idle_point, deadline); return false; } static bool nh_res_bucket_migrate(struct nh_res_table *res_table, u16 bucket_index, bool notify, bool notify_nl, bool force) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *new_nhge; struct netlink_ext_ack extack; int err; new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries, struct nh_grp_entry, res.uw_nh_entry); if (WARN_ON_ONCE(!new_nhge)) /* If this function is called, "bucket" is either not * occupied, or it belongs to a next hop that is * overweight. In either case, there ought to be a * corresponding underweight next hop. */ return false; if (notify) { struct nh_grp_entry *old_nhge; old_nhge = nh_res_dereference(bucket->nh_entry); err = call_nexthop_res_bucket_notifiers(res_table->net, res_table->nhg_id, bucket_index, force, old_nhge->nh, new_nhge->nh, &extack); if (err) { pr_err_ratelimited("%s\n", extack._msg); if (!force) return false; /* It is not possible to veto a forced replacement, so * just clear the hardware flags from the nexthop * bucket to indicate to user space that this bucket is * not correctly populated in hardware. */ bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); } } nh_res_bucket_set_nh(bucket, new_nhge); nh_res_bucket_set_idle(res_table, bucket); if (notify_nl) nexthop_bucket_notify(res_table, bucket_index); if (nh_res_nhge_is_balanced(new_nhge)) list_del(&new_nhge->res.uw_nh_entry); return true; } #define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2) static void nh_res_table_upkeep(struct nh_res_table *res_table, bool notify, bool notify_nl) { unsigned long now = jiffies; unsigned long deadline; u16 i; /* Deadline is the next time that upkeep should be run. It is the * earliest time at which one of the buckets might be migrated. * Start at the most pessimistic estimate: either unbalanced_timer * from now, or if there is none, idle_timer from now. For each * encountered time point, call nh_res_time_set_deadline() to * refine the estimate. */ if (res_table->unbalanced_timer) deadline = now + res_table->unbalanced_timer; else deadline = now + res_table->idle_timer; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; bool force; if (nh_res_bucket_should_migrate(res_table, bucket, &deadline, &force)) { if (!nh_res_bucket_migrate(res_table, i, notify, notify_nl, force)) { unsigned long idle_point; /* A driver can override the migration * decision if the HW reports that the * bucket is actually not idle. Therefore * remark the bucket as busy again and * update the deadline. */ nh_res_bucket_set_busy(bucket); idle_point = nh_res_bucket_idle_point(res_table, bucket, now); nh_res_time_set_deadline(idle_point, &deadline); } } } /* If the group is still unbalanced, schedule the next upkeep to * either the deadline computed above, or the minimum deadline, * whichever comes later. */ if (!nh_res_table_is_balanced(res_table)) { unsigned long now = jiffies; unsigned long min_deadline; min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL; if (time_before(deadline, min_deadline)) deadline = min_deadline; queue_delayed_work(system_power_efficient_wq, &res_table->upkeep_dw, deadline - now); } } static void nh_res_table_upkeep_dw(struct work_struct *work) { struct delayed_work *dw = to_delayed_work(work); struct nh_res_table *res_table; res_table = container_of(dw, struct nh_res_table, upkeep_dw); nh_res_table_upkeep(res_table, true, true); } static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) { cancel_delayed_work_sync(&res_table->upkeep_dw); } static void nh_res_group_rebalance(struct nh_group *nhg, struct nh_res_table *res_table) { int prev_upper_bound = 0; int total = 0; int w = 0; int i; INIT_LIST_HEAD(&res_table->uw_nh_entries); for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; int upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w, total); nhge->res.wants_buckets = upper_bound - prev_upper_bound; prev_upper_bound = upper_bound; if (nh_res_nhge_is_uw(nhge)) { if (list_empty(&res_table->uw_nh_entries)) res_table->unbalanced_since = jiffies; list_add(&nhge->res.uw_nh_entry, &res_table->uw_nh_entries); } } } /* Migrate buckets in res_table so that they reference NHGE's from NHG with * the right NH ID. Set those buckets that do not have a corresponding NHGE * entry in NHG as not occupied. */ static void nh_res_table_migrate_buckets(struct nh_res_table *res_table, struct nh_group *nhg) { u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; bool found = false; int j; for (j = 0; j < nhg->num_nh; j++) { struct nh_grp_entry *nhge = &nhg->nh_entries[j]; if (nhge->nh->id == id) { nh_res_bucket_set_nh(bucket, nhge); found = true; break; } } if (!found) nh_res_bucket_unset_nh(bucket); } } static void replace_nexthop_grp_res(struct nh_group *oldg, struct nh_group *newg) { /* For NH group replacement, the new NHG might only have a stub * hash table with 0 buckets, because the number of buckets was not * specified. For NH removal, oldg and newg both reference the same * res_table. So in any case, in the following, we want to work * with oldg->res_table. */ struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table); unsigned long prev_unbalanced_since = old_res_table->unbalanced_since; bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries); nh_res_table_cancel_upkeep(old_res_table); nh_res_table_migrate_buckets(old_res_table, newg); nh_res_group_rebalance(newg, old_res_table); if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries)) old_res_table->unbalanced_since = prev_unbalanced_since; nh_res_table_upkeep(old_res_table, true, false); } static void nh_hthr_group_rebalance(struct nh_group *nhg) { int total = 0; int w = 0; int i; for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; int upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; atomic_set(&nhge->hthr.upper_bound, upper_bound); } } static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, struct nl_info *nlinfo) { struct nh_grp_entry *nhges, *new_nhges; struct nexthop *nhp = nhge->nh_parent; struct netlink_ext_ack extack; struct nexthop *nh = nhge->nh; struct nh_group *nhg, *newg; int i, j, err; WARN_ON(!nh); nhg = rtnl_dereference(nhp->nh_grp); newg = nhg->spare; /* last entry, keep it visible and remove the parent */ if (nhg->num_nh == 1) { remove_nexthop(net, nhp, nlinfo); return; } newg->has_v4 = false; newg->is_multipath = nhg->is_multipath; newg->hash_threshold = nhg->hash_threshold; newg->resilient = nhg->resilient; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; /* copy old entries to new except the one getting removed */ nhges = nhg->nh_entries; new_nhges = newg->nh_entries; for (i = 0, j = 0; i < nhg->num_nh; ++i) { struct nh_info *nhi; /* current nexthop getting removed */ if (nhg->nh_entries[i].nh == nh) { newg->num_nh--; continue; } nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) newg->has_v4 = true; list_del(&nhges[i].nh_list); new_nhges[j].nh_parent = nhges[i].nh_parent; new_nhges[j].nh = nhges[i].nh; new_nhges[j].weight = nhges[i].weight; list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list); j++; } if (newg->hash_threshold) nh_hthr_group_rebalance(newg); else if (newg->resilient) replace_nexthop_grp_res(nhg, newg); rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); nexthop_put(nhge->nh); /* Removal of a NH from a resilient group is notified through * bucket notifications. */ if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); if (err) pr_err("%s\n", extack._msg); } if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); } static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { struct nh_grp_entry *nhge, *tmp; list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) remove_nh_grp_entry(net, nhge, nlinfo); /* make sure all see the newly published array before releasing rtnl */ synchronize_net(); } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); struct nh_res_table *res_table; int i, num_nh = nhg->num_nh; for (i = 0; i < num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (WARN_ON(!nhge->nh)) continue; list_del_init(&nhge->nh_list); } if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); nh_res_table_cancel_upkeep(res_table); } } /* not called for nexthop replace */ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) { struct fib6_info *f6i, *tmp; bool do_flush = false; struct fib_info *fi; list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; } if (do_flush) fib_flush(net); /* ip6_del_rt removes the entry from this list hence the _safe */ list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); ipv6_stub->ip6_del_rt(net, f6i, !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)); } } static void __remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { __remove_nexthop_fib(net, nh); if (nh->is_group) { remove_nexthop_group(nh, nlinfo); } else { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (nhi->fib_nhc.nhc_dev) hlist_del(&nhi->dev_hash); remove_nexthop_from_groups(net, nh, nlinfo); } } static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL); /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); if (nlinfo) nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); __remove_nexthop(net, nh, nlinfo); nh_base_seq_inc(net); nexthop_put(nh); } /* if any FIB entries reference this nexthop, any dst entries * need to be regenerated */ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh, struct nexthop *replaced_nh) { struct fib6_info *f6i; struct nh_group *nhg; int i; if (!list_empty(&nh->fi_list)) rt_cache_flush(net); list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_update_sernum(net, f6i); /* if an IPv6 group was replaced, we have to release all old * dsts to make sure all refcounts are released */ if (!replaced_nh->is_group) return; nhg = rtnl_dereference(replaced_nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info); if (nhi->family == AF_INET6) ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh); } } static int replace_nexthop_grp(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_res_table *tmp_table = NULL; struct nh_res_table *new_res_table; struct nh_res_table *old_res_table; struct nh_group *oldg, *newg; int i, err; if (!new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); return -EINVAL; } oldg = rtnl_dereference(old->nh_grp); newg = rtnl_dereference(new->nh_grp); if (newg->hash_threshold != oldg->hash_threshold) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); return -EINVAL; } if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; } else if (newg->resilient) { new_res_table = rtnl_dereference(newg->res_table); old_res_table = rtnl_dereference(oldg->res_table); /* Accept if num_nh_buckets was not given, but if it was * given, demand that the value be correct. */ if (cfg->nh_grp_res_has_num_buckets && cfg->nh_grp_res_num_buckets != old_res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group."); return -EINVAL; } /* Emit a pre-replace notification so that listeners could veto * a potentially unsupported configuration. Otherwise, * individual bucket replacement notifications would need to be * vetoed, which is something that should only happen if the * bucket is currently active. */ err = call_nexthop_res_table_notifiers(net, new, extack); if (err) return err; if (cfg->nh_grp_res_has_idle_timer) old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; if (cfg->nh_grp_res_has_unbalanced_timer) old_res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; replace_nexthop_grp_res(oldg, newg); tmp_table = new_res_table; rcu_assign_pointer(newg->res_table, old_res_table); rcu_assign_pointer(newg->spare->res_table, old_res_table); } /* update parents - used by nexthop code for cleanup */ for (i = 0; i < newg->num_nh; i++) newg->nh_entries[i].nh_parent = old; rcu_assign_pointer(old->nh_grp, newg); /* Make sure concurrent readers are not using 'oldg' anymore. */ synchronize_net(); if (newg->resilient) { rcu_assign_pointer(oldg->res_table, tmp_table); rcu_assign_pointer(oldg->spare->res_table, tmp_table); } for (i = 0; i < oldg->num_nh; i++) oldg->nh_entries[i].nh_parent = new; rcu_assign_pointer(new->nh_grp, oldg); return 0; } static void nh_group_v4_update(struct nh_group *nhg) { struct nh_grp_entry *nhges; bool has_v4 = false; int i; nhges = nhg->nh_entries; for (i = 0; i < nhg->num_nh; i++) { struct nh_info *nhi; nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) has_v4 = true; } nhg->has_v4 = has_v4; } static int replace_nexthop_single_notify_res(struct net *net, struct nh_res_table *res_table, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { u32 nhg_id = res_table->nhg_id; int err; u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) { err = __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, oldi, newi, extack); if (err) goto err_notify; } } return 0; err_notify: while (i-- > 0) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, newi, oldi, extack); } return err; } static int replace_nexthop_single_notify(struct net *net, struct nexthop *group_nh, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp); struct nh_res_table *res_table; if (nhg->hash_threshold) { return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, group_nh, extack); } else if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); return replace_nexthop_single_notify_res(net, res_table, old, oldi, newi, extack); } return -EINVAL; } static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { u8 old_protocol, old_nh_flags; struct nh_info *oldi, *newi; struct nh_grp_entry *nhge; int err; if (new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); return -EINVAL; } err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; /* Hardware flags were set on 'old' as 'new' is not in the red-black * tree. Therefore, inherit the flags from 'old' to 'new'. */ new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP); oldi = rtnl_dereference(old->nh_info); newi = rtnl_dereference(new->nh_info); newi->nh_parent = old; oldi->nh_parent = new; old_protocol = old->protocol; old_nh_flags = old->nh_flags; old->protocol = new->protocol; old->nh_flags = new->nh_flags; rcu_assign_pointer(old->nh_info, newi); rcu_assign_pointer(new->nh_info, oldi); /* Send a replace notification for all the groups using the nexthop. */ list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; err = replace_nexthop_single_notify(net, nhp, old, oldi, newi, extack); if (err) goto err_notify; } /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially * update IPv4 indication in all the groups using the nexthop. */ if (oldi->family == AF_INET && newi->family == AF_INET6) { list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; struct nh_group *nhg; nhg = rtnl_dereference(nhp->nh_grp); nh_group_v4_update(nhg); } } return 0; err_notify: rcu_assign_pointer(new->nh_info, newi); rcu_assign_pointer(old->nh_info, oldi); old->nh_flags = old_nh_flags; old->protocol = old_protocol; oldi->nh_parent = old; newi->nh_parent = new; list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL); } call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack); return err; } static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct fib6_info *f6i; if (!list_empty(&nh->fi_list)) { struct fib_info *fi; /* expectation is a few fib_info per nexthop and then * a lot of routes per fib_info. So mark the fib_info * and then walk the fib tables once */ list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = true; fib_info_notify_update(net, info); list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = false; } list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_rt_update(net, f6i, info); } /* send RTM_NEWROUTE with REPLACE flag set for all FIB entries * linked to this nexthop and for all groups that the nexthop * is a member of */ static void nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct nh_grp_entry *nhge; __nexthop_replace_notify(net, nh, info); list_for_each_entry(nhge, &nh->grp_list, nh_list) __nexthop_replace_notify(net, nhge->nh_parent, info); } static int replace_nexthop(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { bool new_is_reject = false; struct nh_grp_entry *nhge; int err; /* check that existing FIB entries are ok with the * new nexthop definition */ err = fib_check_nh_list(old, new, extack); if (err) return err; err = fib6_check_nh_list(old, new, extack); if (err) return err; if (!new->is_group) { struct nh_info *nhi = rtnl_dereference(new->nh_info); new_is_reject = nhi->reject_nh; } list_for_each_entry(nhge, &old->grp_list, nh_list) { /* if new nexthop is a blackhole, any groups using this * nexthop cannot have more than 1 path */ if (new_is_reject && nexthop_num_path(nhge->nh_parent) > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); return -EINVAL; } err = fib_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; err = fib6_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; } if (old->is_group) err = replace_nexthop_grp(net, old, new, cfg, extack); else err = replace_nexthop_single(net, old, new, extack); if (!err) { nh_rt_cache_flush(net, old, new); __remove_nexthop(net, new, NULL); nexthop_put(new); } return err; } /* called with rtnl_lock held */ static int insert_nexthop(struct net *net, struct nexthop *new_nh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct rb_node **pp, *parent = NULL, *next; struct rb_root *root = &net->nexthop.rb_root; bool replace = !!(cfg->nlflags & NLM_F_REPLACE); bool create = !!(cfg->nlflags & NLM_F_CREATE); u32 new_id = new_nh->id; int replace_notify = 0; int rc = -EEXIST; pp = &root->rb_node; while (1) { struct nexthop *nh; next = *pp; if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (new_id < nh->id) { pp = &next->rb_left; } else if (new_id > nh->id) { pp = &next->rb_right; } else if (replace) { rc = replace_nexthop(net, nh, new_nh, cfg, extack); if (!rc) { new_nh = nh; /* send notification with old nh */ replace_notify = 1; } goto out; } else { /* id already exists and not a replace */ goto out; } } if (replace && !create) { NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists"); rc = -ENOENT; goto out; } if (new_nh->is_group) { struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp); struct nh_res_table *res_table; if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); /* Not passing the number of buckets is OK when * replacing, but not when creating a new group. */ if (!cfg->nh_grp_res_has_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion"); rc = -EINVAL; goto out; } nh_res_group_rebalance(nhg, res_table); /* Do not send bucket notifications, we do full * notification below. */ nh_res_table_upkeep(res_table, false, false); } } rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); /* The initial insertion is a full notification for hash-threshold as * well as resilient groups. */ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); if (rc) rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); if (replace_notify && READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } return rc; } /* rtnl */ /* remove all nexthops tied to a device being deleted */ static void nexthop_flush_dev(struct net_device *dev, unsigned long event) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev != dev) continue; if (nhi->reject_nh && (event == NETDEV_DOWN || event == NETDEV_CHANGE)) continue; remove_nexthop(net, nhi->nh_parent, NULL); } } /* rtnl; called when net namespace is deleted */ static void flush_all_nexthops(struct net *net) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; struct nexthop *nh; while ((node = rb_first(root))) { nh = rb_entry(node, struct nexthop, rb_node); remove_nexthop(net, nh, NULL); cond_resched(); } } static struct nexthop *nexthop_create_group(struct net *net, struct nh_config *cfg) { struct nlattr *grps_attr = cfg->nh_grp; struct nexthop_grp *entry = nla_data(grps_attr); u16 num_nh = nla_len(grps_attr) / sizeof(*entry); struct nh_group *nhg; struct nexthop *nh; int err; int i; if (WARN_ON(!num_nh)) return ERR_PTR(-EINVAL); nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nh->is_group = 1; nhg = nexthop_grp_alloc(num_nh); if (!nhg) { kfree(nh); return ERR_PTR(-ENOMEM); } /* spare group used for removals */ nhg->spare = nexthop_grp_alloc(num_nh); if (!nhg->spare) { kfree(nhg); kfree(nh); return ERR_PTR(-ENOMEM); } nhg->spare->spare = nhg; for (i = 0; i < nhg->num_nh; ++i) { struct nexthop *nhe; struct nh_info *nhi; nhe = nexthop_find_by_id(net, entry[i].id); if (!nexthop_get(nhe)) { err = -ENOENT; goto out_no_nh; } nhi = rtnl_dereference(nhe->nh_info); if (nhi->family == AF_INET) nhg->has_v4 = true; nhg->nh_entries[i].nh = nhe; nhg->nh_entries[i].weight = entry[i].weight + 1; list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); nhg->nh_entries[i].nh_parent = nh; } if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { nhg->hash_threshold = 1; nhg->is_multipath = true; } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { struct nh_res_table *res_table; res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg); if (!res_table) { err = -ENOMEM; goto out_no_nh; } rcu_assign_pointer(nhg->spare->res_table, res_table); rcu_assign_pointer(nhg->res_table, res_table); nhg->resilient = true; nhg->is_multipath = true; } WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); if (nhg->hash_threshold) nh_hthr_group_rebalance(nhg); if (cfg->nh_fdb) nhg->fdb_nh = 1; rcu_assign_pointer(nh->nh_grp, nhg); return nh; out_no_nh: for (i--; i >= 0; --i) { list_del(&nhg->nh_entries[i].nh_list); nexthop_put(nhg->nh_entries[i].nh); } kfree(nhg->spare); kfree(nhg); kfree(nh); return ERR_PTR(err); } static int nh_create_ipv4(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib_nh *fib_nh = &nhi->fib_nh; struct fib_config fib_cfg = { .fc_oif = cfg->nh_ifindex, .fc_gw4 = cfg->gw.ipv4, .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); if (err) { fib_nh_release(net, fib_nh); goto out; } if (nhi->fdb_nh) goto out; /* sets nh_dev if successful */ err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { nh->nh_flags = fib_nh->fib_nh_flags; fib_info_update_nhc_saddr(net, &fib_nh->nh_common, !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1); } else { fib_nh_release(net, fib_nh); } out: return err; } static int nh_create_ipv6(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib6_nh *fib6_nh = &nhi->fib6_nh; struct fib6_config fib6_cfg = { .fc_table = l3mdev_fib_table(cfg->dev), .fc_ifindex = cfg->nh_ifindex, .fc_gateway = cfg->gw.ipv6, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, .fc_is_fdb = cfg->nh_fdb, }; int err; if (!ipv6_addr_any(&cfg->gw.ipv6)) fib6_cfg.fc_flags |= RTF_GATEWAY; /* sets nh_dev if successful */ err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, extack); if (err) { /* IPv6 is not enabled, don't call fib6_nh_release */ if (err == -EAFNOSUPPORT) goto out; ipv6_stub->fib6_nh_release(fib6_nh); } else { nh->nh_flags = fib6_nh->fib_nh_flags; } out: return err; } static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; struct nexthop *nh; int err = 0; nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); if (!nhi) { kfree(nh); return ERR_PTR(-ENOMEM); } nh->nh_flags = cfg->nh_flags; nh->net = net; nhi->nh_parent = nh; nhi->family = cfg->nh_family; nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; if (cfg->nh_fdb) nhi->fdb_nh = 1; if (cfg->nh_blackhole) { nhi->reject_nh = 1; cfg->nh_ifindex = net->loopback_dev->ifindex; } switch (cfg->nh_family) { case AF_INET: err = nh_create_ipv4(net, nh, nhi, cfg, extack); break; case AF_INET6: err = nh_create_ipv6(net, nh, nhi, cfg, extack); break; } if (err) { kfree(nhi); kfree(nh); return ERR_PTR(err); } /* add the entry to the device based hash */ if (!nhi->fdb_nh) nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); return nh; } /* called with rtnl lock held */ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nexthop *nh; int err; if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) { NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); return ERR_PTR(-EINVAL); } if (!cfg->nh_id) { cfg->nh_id = nh_find_unused_id(net); if (!cfg->nh_id) { NL_SET_ERR_MSG(extack, "No unused id"); return ERR_PTR(-EINVAL); } } if (cfg->nh_grp) nh = nexthop_create_group(net, cfg); else nh = nexthop_create(net, cfg, extack); if (IS_ERR(nh)) return nh; refcount_set(&nh->refcnt, 1); nh->id = cfg->nh_id; nh->protocol = cfg->nh_protocol; nh->net = net; err = insert_nexthop(net, nh, cfg, extack); if (err) { __remove_nexthop(net, nh, NULL); nexthop_put(nh); nh = ERR_PTR(err); } return nh; } static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback, unsigned long *timer_p, bool *has_p, struct netlink_ext_ack *extack) { unsigned long timer; u32 value; if (!attr) { *timer_p = fallback; *has_p = false; return 0; } value = nla_get_u32(attr); timer = clock_t_to_jiffies(value); if (timer == ~0UL) { NL_SET_ERR_MSG(extack, "Timer value too large"); return -EINVAL; } *timer_p = timer; *has_p = true; return 0; } static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {}; int err; if (res) { err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_policy_new) - 1, res, rtm_nh_res_policy_new, extack); if (err < 0) return err; } if (tb[NHA_RES_GROUP_BUCKETS]) { cfg->nh_grp_res_num_buckets = nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]); cfg->nh_grp_res_has_num_buckets = true; if (!cfg->nh_grp_res_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0"); return -EINVAL; } } err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER], NH_RES_DEFAULT_IDLE_TIMER, &cfg->nh_grp_res_idle_timer, &cfg->nh_grp_res_has_idle_timer, extack); if (err) return err; return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER], NH_RES_DEFAULT_UNBALANCED_TIMER, &cfg->nh_grp_res_unbalanced_timer, &cfg->nh_grp_res_has_unbalanced_timer, extack); } static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; int err; err = nlmsg_parse(nlh, sizeof(*nhm), tb, ARRAY_SIZE(rtm_nh_policy_new) - 1, rtm_nh_policy_new, extack); if (err < 0) return err; err = -EINVAL; if (nhm->resvd || nhm->nh_scope) { NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); goto out; } if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); goto out; } switch (nhm->nh_family) { case AF_INET: case AF_INET6: break; case AF_UNSPEC: if (tb[NHA_GROUP]) break; fallthrough; default: NL_SET_ERR_MSG(extack, "Invalid address family"); goto out; } memset(cfg, 0, sizeof(*cfg)); cfg->nlflags = nlh->nlmsg_flags; cfg->nlinfo.portid = NETLINK_CB(skb).portid; cfg->nlinfo.nlh = nlh; cfg->nlinfo.nl_net = net; cfg->nh_family = nhm->nh_family; cfg->nh_protocol = nhm->nh_protocol; cfg->nh_flags = nhm->nh_flags; if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); if (tb[NHA_FDB]) { if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); goto out; } if (nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); goto out; } cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); } if (tb[NHA_GROUP]) { if (nhm->nh_family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid family for group"); goto out; } cfg->nh_grp = tb[NHA_GROUP]; cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; if (tb[NHA_GROUP_TYPE]) cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), cfg->nh_grp_type, extack); if (err) goto out; if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP], cfg, extack); /* no other attributes should be set */ goto out; } if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb"); goto out; } cfg->nh_blackhole = 1; err = 0; goto out; } if (!cfg->nh_fdb && !tb[NHA_OIF]) { NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); goto out; } if (!cfg->nh_fdb && tb[NHA_OIF]) { cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); if (cfg->nh_ifindex) cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); if (!cfg->dev) { NL_SET_ERR_MSG(extack, "Invalid device index"); goto out; } else if (!(cfg->dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } else if (!netif_carrier_ok(cfg->dev)) { NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); err = -ENETDOWN; goto out; } } err = -EINVAL; if (tb[NHA_GATEWAY]) { struct nlattr *gwa = tb[NHA_GATEWAY]; switch (cfg->nh_family) { case AF_INET: if (nla_len(gwa) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv4 = nla_get_be32(gwa); break; case AF_INET6: if (nla_len(gwa) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv6 = nla_get_in6_addr(gwa); break; default: NL_SET_ERR_MSG(extack, "Unknown address family for gateway"); goto out; } } else { /* device only nexthop (no gateway) */ if (cfg->nh_flags & RTNH_F_ONLINK) { NL_SET_ERR_MSG(extack, "ONLINK flag can not be set for nexthop without a gateway"); goto out; } } if (tb[NHA_ENCAP]) { cfg->nh_encap = tb[NHA_ENCAP]; if (!tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing"); goto out; } cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); if (err < 0) goto out; } else if (tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing"); goto out; } err = 0; out: return err; } /* rtnl */ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nh_config cfg; struct nexthop *nh; int err; err = rtm_to_nh_config(net, skb, nlh, &cfg, extack); if (!err) { nh = nexthop_add(net, &cfg, extack); if (IS_ERR(nh)) err = PTR_ERR(nh); } return err; } static int __nh_valid_get_del_req(const struct nlmsghdr *nlh, struct nlattr **tb, u32 *id, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header"); return -EINVAL; } if (!tb[NHA_ID]) { NL_SET_ERR_MSG(extack, "Nexthop id is missing"); return -EINVAL; } *id = nla_get_u32(tb[NHA_ID]); if (!(*id)) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } return 0; } static int nh_valid_get_del_req(const struct nlmsghdr *nlh, u32 *id, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get, extack); if (err < 0) return err; return __nh_valid_get_del_req(nlh, tb, id, extack); } /* rtnl */ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nl_info nlinfo = { .nlh = nlh, .nl_net = net, .portid = NETLINK_CB(skb).portid, }; struct nexthop *nh; int err; u32 id; err = nh_valid_get_del_req(nlh, &id, extack); if (err) return err; nh = nexthop_find_by_id(net, id); if (!nh) return -ENOENT; remove_nexthop(net, nh, &nlinfo); return 0; } /* rtnl */ static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct sk_buff *skb = NULL; struct nexthop *nh; int err; u32 id; err = nh_valid_get_del_req(nlh, &id, extack); if (err) return err; err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto out; err = -ENOENT; nh = nexthop_find_by_id(net, id); if (!nh) goto errout_free; err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: return err; errout_free: kfree_skb(skb); goto out; } struct nh_dump_filter { u32 nh_id; int dev_idx; int master_idx; bool group_filter; bool fdb_filter; u32 res_bucket_nh_id; }; static bool nh_dump_filtered(struct nexthop *nh, struct nh_dump_filter *filter, u8 family) { const struct net_device *dev; const struct nh_info *nhi; if (filter->group_filter && !nh->is_group) return true; if (!filter->dev_idx && !filter->master_idx && !family) return false; if (nh->is_group) return true; nhi = rtnl_dereference(nh->nh_info); if (family && nhi->family != family) return true; dev = nhi->fib_nhc.nhc_dev; if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx)) return true; if (filter->master_idx) { struct net_device *master; if (!dev) return true; master = netdev_master_upper_dev_get((struct net_device *)dev); if (!master || master->ifindex != filter->master_idx) return true; } return false; } static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb, struct nh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nhmsg *nhm; u32 idx; if (tb[NHA_OIF]) { idx = nla_get_u32(tb[NHA_OIF]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid device index"); return -EINVAL; } filter->dev_idx = idx; } if (tb[NHA_MASTER]) { idx = nla_get_u32(tb[NHA_MASTER]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid master device index"); return -EINVAL; } filter->master_idx = idx; } filter->group_filter = nla_get_flag(tb[NHA_GROUPS]); filter->fdb_filter = nla_get_flag(tb[NHA_FDB]); nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); return -EINVAL; } return 0; } static int nh_valid_dump_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump) - 1, rtm_nh_policy_dump, cb->extack); if (err < 0) return err; return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_nh_ctx { u32 idx; }; static struct rtm_dump_nh_ctx * rtm_dump_nh_ctx(struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } static int rtm_dump_walk_nexthops(struct sk_buff *skb, struct netlink_callback *cb, struct rb_root *root, struct rtm_dump_nh_ctx *ctx, int (*nh_cb)(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data), void *data) { struct rb_node *node; int s_idx; int err; s_idx = ctx->idx; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); if (nh->id < s_idx) continue; ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); if (err) return err; } return 0; } static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_dump_filter *filter = data; if (nh_dump_filtered(nh, filter, nhm->nh_family)) return 0; return nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); } /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb); struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; struct nh_dump_filter filter = {}; int err; err = nh_valid_dump_req(cb->nlh, &filter, cb); if (err < 0) return err; err = rtm_dump_walk_nexthops(skb, cb, root, ctx, &rtm_dump_nexthop_cb, &filter); if (err < 0) { if (likely(skb->len)) err = skb->len; } cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static struct nexthop * nexthop_find_group_resilient(struct net *net, u32 id, struct netlink_ext_ack *extack) { struct nh_group *nhg; struct nexthop *nh; nh = nexthop_find_by_id(net, id); if (!nh) return ERR_PTR(-ENOENT); if (!nh->is_group) { NL_SET_ERR_MSG(extack, "Not a nexthop group"); return ERR_PTR(-EINVAL); } nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) { NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient"); return ERR_PTR(-EINVAL); } return nh; } static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p, struct netlink_ext_ack *extack) { u32 idx; if (attr) { idx = nla_get_u32(attr); if (!idx) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } *nh_id_p = idx; } else { *nh_id_p = 0; } return 0; } static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)]; struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1, rtm_nh_policy_dump_bucket, NULL); if (err < 0) return err; err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack); if (err) return err; if (tb[NHA_RES_BUCKET]) { size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1; err = nla_parse_nested(res_tb, max, tb[NHA_RES_BUCKET], rtm_nh_res_bucket_policy_dump, cb->extack); if (err < 0) return err; err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID], &filter->res_bucket_nh_id, cb->extack); if (err) return err; } return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_res_bucket_ctx { struct rtm_dump_nh_ctx nh; u16 bucket_index; }; static struct rtm_dump_res_bucket_ctx * rtm_dump_res_bucket_ctx(struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } struct rtm_dump_nexthop_bucket_data { struct rtm_dump_res_bucket_ctx *ctx; struct nh_dump_filter filter; }; static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, struct rtm_dump_nexthop_bucket_data *dd) { u32 portid = NETLINK_CB(cb->skb).portid; struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_res_table *res_table; struct nh_group *nhg; u16 bucket_index; int err; nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); for (bucket_index = dd->ctx->bucket_index; bucket_index < res_table->num_nh_buckets; bucket_index++) { struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; bucket = &res_table->nh_buckets[bucket_index]; nhge = rtnl_dereference(bucket->nh_entry); if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family)) continue; if (dd->filter.res_bucket_nh_id && dd->filter.res_bucket_nh_id != nhge->nh->id) continue; dd->ctx->bucket_index = bucket_index; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); if (err) return err; } dd->ctx->bucket_index = 0; return 0; } static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct rtm_dump_nexthop_bucket_data *dd = data; struct nh_group *nhg; if (!nh->is_group) return 0; nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) return 0; return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd); } /* rtnl */ static int rtm_dump_nexthop_bucket(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb); struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx }; struct net *net = sock_net(skb->sk); struct nexthop *nh; int err; err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb); if (err) return err; if (dd.filter.nh_id) { nh = nexthop_find_group_resilient(net, dd.filter.nh_id, cb->extack); if (IS_ERR(nh)) return PTR_ERR(nh); err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd); } else { struct rb_root *root = &net->nexthop.rb_root; err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh, &rtm_dump_nexthop_bucket_cb, &dd); } if (err < 0) { if (likely(skb->len)) err = skb->len; } cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)]; int err; err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1, res, rtm_nh_res_bucket_policy_get, extack); if (err < 0) return err; if (!tb[NHA_RES_BUCKET_INDEX]) { NL_SET_ERR_MSG(extack, "Bucket index is missing"); return -EINVAL; } *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]); return 0; } static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh, u32 *id, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1, rtm_nh_policy_get_bucket, extack); if (err < 0) return err; err = __nh_valid_get_del_req(nlh, tb, id, extack); if (err) return err; if (!tb[NHA_RES_BUCKET]) { NL_SET_ERR_MSG(extack, "Bucket information is missing"); return -EINVAL; } err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET], bucket_index, extack); if (err) return err; return 0; } /* rtnl */ static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nh_res_table *res_table; struct sk_buff *skb = NULL; struct nh_group *nhg; struct nexthop *nh; u16 bucket_index; int err; u32 id; err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack); if (err) return err; nh = nexthop_find_group_resilient(net, id, extack); if (IS_ERR(nh)) return PTR_ERR(nh); nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); if (bucket_index >= res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Bucket index out of bounds"); return -ENOENT; } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index], bucket_index, RTM_NEWNEXTHOPBUCKET, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, extack); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: kfree_skb(skb); return err; } static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev == dev) { if (nhi->family == AF_INET) fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, orig_mtu); } } } /* rtnl */ static int nh_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_info_ext *info_ext; switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nexthop_flush_dev(dev, event); break; case NETDEV_CHANGE: if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) nexthop_flush_dev(dev, event); break; case NETDEV_CHANGEMTU: info_ext = ptr; nexthop_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(dev_net(dev)); break; } return NOTIFY_DONE; } static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; static int nexthops_dump(struct net *net, struct notifier_block *nb, enum nexthop_event_type event_type, struct netlink_ext_ack *extack) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; int err = 0; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); err = call_nexthop_notifier(nb, net, event_type, nh, extack); if (err) break; } return err; } int register_nexthop_notifier(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; rtnl_lock(); err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack); if (err) goto unlock; err = blocking_notifier_chain_register(&net->nexthop.notifier_chain, nb); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(register_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; rtnl_lock(); err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, nb); if (err) goto unlock; nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(unregister_nexthop_notifier); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap) { struct nexthop *nexthop; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop) goto out; nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) nexthop->nh_flags |= RTNH_F_OFFLOAD; if (trap) nexthop->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_set_hw_flags); void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, bool offload, bool trap) { struct nh_res_table *res_table; struct nh_res_bucket *bucket; struct nexthop *nexthop; struct nh_group *nhg; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; if (bucket_index >= nhg->res_table->num_nh_buckets) goto out; res_table = rcu_dereference(nhg->res_table); bucket = &res_table->nh_buckets[bucket_index]; bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) bucket->nh_flags |= RTNH_F_OFFLOAD; if (trap) bucket->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_bucket_set_hw_flags); void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, unsigned long *activity) { struct nh_res_table *res_table; struct nexthop *nexthop; struct nh_group *nhg; u16 i; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; /* Instead of silently ignoring some buckets, demand that the sizes * be the same. */ res_table = rcu_dereference(nhg->res_table); if (num_buckets != res_table->num_nh_buckets) goto out; for (i = 0; i < num_buckets; i++) { if (test_bit(i, activity)) nh_res_bucket_set_busy(&res_table->nh_buckets[i]); } out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_res_grp_activity_update); static void __net_exit nexthop_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { flush_all_nexthops(net); kfree(net->nexthop.devhash); } rtnl_unlock(); } static int __net_init nexthop_net_init(struct net *net) { size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE; net->nexthop.rb_root = RB_ROOT; net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, .exit_batch = nexthop_net_exit_batch, }; static int __init nexthop_init(void) { register_pernet_subsys(&nexthop_net_ops); register_netdevice_notifier(&nh_netdev_notifier); rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop, rtm_dump_nexthop, 0); rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket, rtm_dump_nexthop_bucket, 0); return 0; } subsys_initcall(nexthop_init); |
1362 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_RTNETLINK_H #define __LINUX_RTNETLINK_H #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/wait.h> #include <linux/refcount.h> #include <uapi/linux/rtnetlink.h> extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, const struct nlmsghdr *nlh, gfp_t flags); extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags, u32 portid, const struct nlmsghdr *nlh); void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex, u32 portid, const struct nlmsghdr *nlh); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags, u32 portid, const struct nlmsghdr *nlh); /* RTNL is used as a global lock for all changes to network configuration */ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); extern int rtnl_lock_killable(void); extern bool refcount_dec_and_rtnl_lock(refcount_t *r); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore pernet_ops_rwsem; extern struct rw_semaphore net_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); #else static inline bool lockdep_rtnl_is_held(void) { return true; } #endif /* #ifdef CONFIG_PROVE_LOCKING */ /** * rcu_dereference_rtnl - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * * Do an rcu_dereference(p), but check caller either holds rcu_read_lock() * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference() */ #define rcu_dereference_rtnl(p) \ rcu_dereference_check(p, lockdep_rtnl_is_held()) /** * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL * @p: The pointer to read, prior to dereferencing * * Return the value of the specified RCU-protected pointer, but omit * the READ_ONCE(), because caller holds RTNL. */ #define rtnl_dereference(p) \ rcu_dereference_protected(p, lockdep_rtnl_is_held()) static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) { return rtnl_dereference(dev->ingress_queue); } static inline struct netdev_queue *dev_ingress_queue_rcu(struct net_device *dev) { return rcu_dereference(dev->ingress_queue); } struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); #ifdef CONFIG_NET_INGRESS void net_inc_ingress_queue(void); void net_dec_ingress_queue(void); #endif #ifdef CONFIG_NET_EGRESS void net_inc_egress_queue(void); void net_dec_egress_queue(void); void netdev_xmit_skip_txqueue(bool skip); #endif void rtnetlink_init(void); void __rtnl_unlock(void); void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); #define ASSERT_RTNL() \ WARN_ONCE(!rtnl_is_locked(), \ "RTNL: assertion failed at %s (%d)\n", __FILE__, __LINE__) extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx); extern int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags); extern int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, u32 flags, u32 mask, int nlflags, u32 filter_mask, int (*vlan_fill)(struct sk_buff *skb, struct net_device *dev, u32 filter_mask)); extern void rtnl_offload_xstats_notify(struct net_device *dev); #endif /* __LINUX_RTNETLINK_H */ |
726 70 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SOCKET_H #define _LINUX_SOCKET_H #include <asm/socket.h> /* arch-dependent defines */ #include <linux/sockios.h> /* the SIOCxxx I/O controls */ #include <linux/uio.h> /* iovec support */ #include <linux/types.h> /* pid_t */ #include <linux/compiler.h> /* __user */ #include <uapi/linux/socket.h> struct file; struct pid; struct cred; struct socket; struct sock; struct sk_buff; #define __sockaddr_check_size(size) \ BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage))) #ifdef CONFIG_PROC_FS struct seq_file; extern void socket_seq_show(struct seq_file *seq); #endif typedef __kernel_sa_family_t sa_family_t; /* * 1003.1g requires sa_family_t and that sa_data is char. */ struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ union { char sa_data_min[14]; /* Minimum 14 bytes of protocol address */ DECLARE_FLEX_ARRAY(char, sa_data); }; }; struct linger { int l_onoff; /* Linger active */ int l_linger; /* How long to linger for */ }; #define sockaddr_storage __kernel_sockaddr_storage /* * As we do 4.4BSD message passing we use a 4.4BSD message passing * system, not 4.3. Thus msg_accrights(len) are now missing. They * belong in an obscure libc emulation or the bin. */ struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ int msg_inq; /* output, data left in socket */ struct iov_iter msg_iter; /* data */ /* * Ancillary data. msg_control_user is the user buffer used for the * recv* side when msg_control_is_user is set, msg_control is the kernel * buffer used for all other cases. */ union { void *msg_control; void __user *msg_control_user; }; bool msg_control_is_user : 1; bool msg_get_inq : 1;/* return INQ after receive */ unsigned int msg_flags; /* flags on received message */ __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ struct ubuf_info *msg_ubuf; int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); }; struct user_msghdr { void __user *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ struct iovec __user *msg_iov; /* scatter/gather array */ __kernel_size_t msg_iovlen; /* # elements in msg_iov */ void __user *msg_control; /* ancillary data */ __kernel_size_t msg_controllen; /* ancillary data buffer length */ unsigned int msg_flags; /* flags on received message */ }; /* For recvmmsg/sendmmsg */ struct mmsghdr { struct user_msghdr msg_hdr; unsigned int msg_len; }; /* * POSIX 1003.1g - ancillary data object information * Ancillary data consists of a sequence of pairs of * (cmsghdr, cmsg_data[]) */ struct cmsghdr { __kernel_size_t cmsg_len; /* data byte count, including hdr */ int cmsg_level; /* originating protocol */ int cmsg_type; /* protocol-specific type */ }; /* * Ancillary data object information MACROS * Table 5-14 of POSIX 1003.1g */ #define __CMSG_NXTHDR(ctl, len, cmsg) __cmsg_nxthdr((ctl),(len),(cmsg)) #define CMSG_NXTHDR(mhdr, cmsg) cmsg_nxthdr((mhdr), (cmsg)) #define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) ) #define CMSG_DATA(cmsg) \ ((void *)(cmsg) + sizeof(struct cmsghdr)) #define CMSG_USER_DATA(cmsg) \ ((void __user *)(cmsg) + sizeof(struct cmsghdr)) #define CMSG_SPACE(len) (sizeof(struct cmsghdr) + CMSG_ALIGN(len)) #define CMSG_LEN(len) (sizeof(struct cmsghdr) + (len)) #define __CMSG_FIRSTHDR(ctl,len) ((len) >= sizeof(struct cmsghdr) ? \ (struct cmsghdr *)(ctl) : \ (struct cmsghdr *)NULL) #define CMSG_FIRSTHDR(msg) __CMSG_FIRSTHDR((msg)->msg_control, (msg)->msg_controllen) #define CMSG_OK(mhdr, cmsg) ((cmsg)->cmsg_len >= sizeof(struct cmsghdr) && \ (cmsg)->cmsg_len <= (unsigned long) \ ((mhdr)->msg_controllen - \ ((char *)(cmsg) - (char *)(mhdr)->msg_control))) #define for_each_cmsghdr(cmsg, msg) \ for (cmsg = CMSG_FIRSTHDR(msg); \ cmsg; \ cmsg = CMSG_NXTHDR(msg, cmsg)) /* * Get the next cmsg header * * PLEASE, do not touch this function. If you think, that it is * incorrect, grep kernel sources and think about consequences * before trying to improve it. * * Now it always returns valid, not truncated ancillary object * HEADER. But caller still MUST check, that cmsg->cmsg_len is * inside range, given by msg->msg_controllen before using * ancillary object DATA. --ANK (980731) */ static inline struct cmsghdr * __cmsg_nxthdr(void *__ctl, __kernel_size_t __size, struct cmsghdr *__cmsg) { struct cmsghdr * __ptr; __ptr = (struct cmsghdr*)(((unsigned char *) __cmsg) + CMSG_ALIGN(__cmsg->cmsg_len)); if ((unsigned long)((char*)(__ptr+1) - (char *) __ctl) > __size) return (struct cmsghdr *)0; return __ptr; } static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr *__cmsg) { return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } static inline size_t msg_data_left(struct msghdr *msg) { return iov_iter_count(&msg->msg_iter); } /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x01 /* rw: access rights (array of int) */ #define SCM_CREDENTIALS 0x02 /* rw: struct ucred */ #define SCM_SECURITY 0x03 /* rw: security label */ #define SCM_PIDFD 0x04 /* ro: pidfd (int) */ struct ucred { __u32 pid; __u32 uid; __u32 gid; }; /* Supported address families. */ #define AF_UNSPEC 0 #define AF_UNIX 1 /* Unix domain sockets */ #define AF_LOCAL 1 /* POSIX name for AF_UNIX */ #define AF_INET 2 /* Internet IP Protocol */ #define AF_AX25 3 /* Amateur Radio AX.25 */ #define AF_IPX 4 /* Novell IPX */ #define AF_APPLETALK 5 /* AppleTalk DDP */ #define AF_NETROM 6 /* Amateur Radio NET/ROM */ #define AF_BRIDGE 7 /* Multiprotocol bridge */ #define AF_ATMPVC 8 /* ATM PVCs */ #define AF_X25 9 /* Reserved for X.25 project */ #define AF_INET6 10 /* IP version 6 */ #define AF_ROSE 11 /* Amateur Radio X.25 PLP */ #define AF_DECnet 12 /* Reserved for DECnet project */ #define AF_NETBEUI 13 /* Reserved for 802.2LLC project*/ #define AF_SECURITY 14 /* Security callback pseudo AF */ #define AF_KEY 15 /* PF_KEY key management API */ #define AF_NETLINK 16 #define AF_ROUTE AF_NETLINK /* Alias to emulate 4.4BSD */ #define AF_PACKET 17 /* Packet family */ #define AF_ASH 18 /* Ash */ #define AF_ECONET 19 /* Acorn Econet */ #define AF_ATMSVC 20 /* ATM SVCs */ #define AF_RDS 21 /* RDS sockets */ #define AF_SNA 22 /* Linux SNA Project (nutters!) */ #define AF_IRDA 23 /* IRDA sockets */ #define AF_PPPOX 24 /* PPPoX sockets */ #define AF_WANPIPE 25 /* Wanpipe API Sockets */ #define AF_LLC 26 /* Linux LLC */ #define AF_IB 27 /* Native InfiniBand address */ #define AF_MPLS 28 /* MPLS */ #define AF_CAN 29 /* Controller Area Network */ #define AF_TIPC 30 /* TIPC sockets */ #define AF_BLUETOOTH 31 /* Bluetooth sockets */ #define AF_IUCV 32 /* IUCV sockets */ #define AF_RXRPC 33 /* RxRPC sockets */ #define AF_ISDN 34 /* mISDN sockets */ #define AF_PHONET 35 /* Phonet sockets */ #define AF_IEEE802154 36 /* IEEE802154 sockets */ #define AF_CAIF 37 /* CAIF sockets */ #define AF_ALG 38 /* Algorithm sockets */ #define AF_NFC 39 /* NFC sockets */ #define AF_VSOCK 40 /* vSockets */ #define AF_KCM 41 /* Kernel Connection Multiplexor*/ #define AF_QIPCRTR 42 /* Qualcomm IPC Router */ #define AF_SMC 43 /* smc sockets: reserve number for * PF_SMC protocol family that * reuses AF_INET address family */ #define AF_XDP 44 /* XDP sockets */ #define AF_MCTP 45 /* Management component * transport protocol */ #define AF_MAX 46 /* For now.. */ /* Protocol families, same as address families. */ #define PF_UNSPEC AF_UNSPEC #define PF_UNIX AF_UNIX #define PF_LOCAL AF_LOCAL #define PF_INET AF_INET #define PF_AX25 AF_AX25 #define PF_IPX AF_IPX #define PF_APPLETALK AF_APPLETALK #define PF_NETROM AF_NETROM #define PF_BRIDGE AF_BRIDGE #define PF_ATMPVC AF_ATMPVC #define PF_X25 AF_X25 #define PF_INET6 AF_INET6 #define PF_ROSE AF_ROSE #define PF_DECnet AF_DECnet #define PF_NETBEUI AF_NETBEUI #define PF_SECURITY AF_SECURITY #define PF_KEY AF_KEY #define PF_NETLINK AF_NETLINK #define PF_ROUTE AF_ROUTE #define PF_PACKET AF_PACKET #define PF_ASH AF_ASH #define PF_ECONET AF_ECONET #define PF_ATMSVC AF_ATMSVC #define PF_RDS AF_RDS #define PF_SNA AF_SNA #define PF_IRDA AF_IRDA #define PF_PPPOX AF_PPPOX #define PF_WANPIPE AF_WANPIPE #define PF_LLC AF_LLC #define PF_IB AF_IB #define PF_MPLS AF_MPLS #define PF_CAN AF_CAN #define PF_TIPC AF_TIPC #define PF_BLUETOOTH AF_BLUETOOTH #define PF_IUCV AF_IUCV #define PF_RXRPC AF_RXRPC #define PF_ISDN AF_ISDN #define PF_PHONET AF_PHONET #define PF_IEEE802154 AF_IEEE802154 #define PF_CAIF AF_CAIF #define PF_ALG AF_ALG #define PF_NFC AF_NFC #define PF_VSOCK AF_VSOCK #define PF_KCM AF_KCM #define PF_QIPCRTR AF_QIPCRTR #define PF_SMC AF_SMC #define PF_XDP AF_XDP #define PF_MCTP AF_MCTP #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. */ #define SOMAXCONN 4096 /* Flags we can use with send/ and recv. Added those for 1003.1g not all are supported yet */ #define MSG_OOB 1 #define MSG_PEEK 2 #define MSG_DONTROUTE 4 #define MSG_TRYHARD 4 /* Synonym for MSG_DONTROUTE for DECnet */ #define MSG_CTRUNC 8 #define MSG_PROBE 0x10 /* Do not send. Only probe path f.e. for MTU */ #define MSG_TRUNC 0x20 #define MSG_DONTWAIT 0x40 /* Nonblocking io */ #define MSG_EOR 0x80 /* End of record */ #define MSG_WAITALL 0x100 /* Wait for a full request */ #define MSG_FIN 0x200 #define MSG_SYN 0x400 #define MSG_CONFIRM 0x800 /* Confirm path validity */ #define MSG_RST 0x1000 #define MSG_ERRQUEUE 0x2000 /* Fetch message from error queue */ #define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ #define MSG_MORE 0x8000 /* Sender will send more */ #define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ #define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */ #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN #define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ #define MSG_SENDPAGE_DECRYPTED 0x100000 /* sendpage() internal : page may carry * plain text and require encryption */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through SCM_RIGHTS */ #if defined(CONFIG_COMPAT) #define MSG_CMSG_COMPAT 0x80000000 /* This message needs 32 bit fixups */ #else #define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ #endif /* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */ #define MSG_INTERNAL_SENDMSG_FLAGS \ (MSG_SPLICE_PAGES | MSG_SENDPAGE_NOPOLICY | MSG_SENDPAGE_DECRYPTED) /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */ #define SOL_IP 0 /* #define SOL_ICMP 1 No-no-no! Due to Linux :-) we cannot use SOL_ICMP=1 */ #define SOL_TCP 6 #define SOL_UDP 17 #define SOL_IPV6 41 #define SOL_ICMPV6 58 #define SOL_SCTP 132 #define SOL_UDPLITE 136 /* UDP-Lite (RFC 3828) */ #define SOL_RAW 255 #define SOL_IPX 256 #define SOL_AX25 257 #define SOL_ATALK 258 #define SOL_NETROM 259 #define SOL_ROSE 260 #define SOL_DECNET 261 #define SOL_X25 262 #define SOL_PACKET 263 #define SOL_ATM 264 /* ATM layer (cell level) */ #define SOL_AAL 265 /* ATM Adaption Layer (packet level) */ #define SOL_IRDA 266 #define SOL_NETBEUI 267 #define SOL_LLC 268 #define SOL_DCCP 269 #define SOL_NETLINK 270 #define SOL_TIPC 271 #define SOL_RXRPC 272 #define SOL_PPPOL2TP 273 #define SOL_BLUETOOTH 274 #define SOL_PNPIPE 275 #define SOL_RDS 276 #define SOL_IUCV 277 #define SOL_CAIF 278 #define SOL_ALG 279 #define SOL_NFC 280 #define SOL_KCM 281 #define SOL_TLS 282 #define SOL_XDP 283 #define SOL_MPTCP 284 #define SOL_MCTP 285 #define SOL_SMC 286 /* IPX options */ #define IPX_TYPE 1 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); struct timespec64; struct __kernel_timespec; struct old_timespec32; struct scm_timestamping_internal { struct timespec64 ts[3]; }; extern void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss); extern void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss); /* The __sys_...msg variants allow MSG_CMSG_COMPAT iff * forbid_cmsg_compat==false */ extern long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat); extern long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat); extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct __kernel_timespec __user *timeout, struct old_timespec32 __user *timeout32); extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, bool forbid_cmsg_compat); extern long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg, unsigned int flags); extern long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags); extern int sendmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct iovec **iov); extern int recvmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct sockaddr __user **uaddr, struct iovec **iov); extern int __copy_msghdr(struct msghdr *kmsg, struct user_msghdr *umsg, struct sockaddr __user **save_addr); /* helpers which do the actual work for syscalls */ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, struct sockaddr __user *addr, int __user *addr_len); extern int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len); extern struct file *do_accept(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, int addrlen, int file_flags); extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_socketpair(int family, int type, int protocol, int __user *usockvec); extern int __sys_shutdown_sock(struct socket *sock, int how); extern int __sys_shutdown(int fd, int how); #endif /* _LINUX_SOCKET_H */ |
90 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 | // SPDX-License-Identifier: GPL-2.0 /* RTT/RTO calculation. * * Adapted from TCP for AF_RXRPC by David Howells (dhowells@redhat.com) * * https://tools.ietf.org/html/rfc6298 * https://tools.ietf.org/html/rfc1122#section-4.2.3.1 * http://ccr.sigcomm.org/archive/1995/jan95/ccr-9501-partridge87.pdf */ #include <linux/net.h> #include "ar-internal.h" #define RXRPC_RTO_MAX ((unsigned)(120 * HZ)) #define RXRPC_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) { return 200; } static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) { return usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us); } static u32 rxrpc_bound_rto(u32 rto) { return min(rto, RXRPC_RTO_MAX); } /* * Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) { long m = sample_rtt_us; /* RTT */ u32 srtt = peer->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ if (srtt != 0) { m -= (srtt >> 3); /* m is now error in rtt est */ srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (peer->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain * for mdev in this case (alpha*beta). * Like Eifel it also prevents growth of rto, * but also it limits too fast rto decreases, * happening in pure Eifel. */ if (m > 0) m >>= 3; } else { m -= (peer->mdev_us >> 2); /* similar update on mdev */ } peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ if (peer->mdev_us > peer->mdev_max_us) { peer->mdev_max_us = peer->mdev_us; if (peer->mdev_max_us > peer->rttvar_us) peer->rttvar_us = peer->mdev_max_us; } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ peer->mdev_us = m << 1; /* make sure rto = 3*rtt */ peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer)); peer->mdev_max_us = peer->rttvar_us; } peer->srtt_us = max(1U, srtt); } /* * Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ static void rxrpc_set_rto(struct rxrpc_peer *peer) { u32 rto; /* 1. If rtt variance happened to be less 50msec, it is hallucination. * It cannot be less due to utterly erratic ACK generation made * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ rto = __rxrpc_set_rto(peer); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exactly, which we pretend to do. */ /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo * guarantees that rto is higher. */ peer->rto_j = rxrpc_bound_rto(rto); } static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) { if (rtt_us < 0) return; //rxrpc_update_rtt_min(peer, rtt_us); rxrpc_rtt_estimator(peer, rtt_us); rxrpc_set_rto(peer); /* RFC6298: only reset backoff on valid RTT measurement. */ peer->backoff = 0; } /* * Add RTT information to cache. This is called in softirq mode and has * exclusive access to the peer RTT data. */ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time) { struct rxrpc_peer *peer = call->peer; s64 rtt_us; rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); if (rtt_us < 0) return; spin_lock(&peer->rtt_input_lock); rxrpc_ack_update_rtt(peer, rtt_us); if (peer->rtt_count < 3) peer->rtt_count++; spin_unlock(&peer->rtt_input_lock); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, peer->srtt_us >> 3, peer->rto_j); } /* * Get the retransmission timeout to set in jiffies, backing it off each time * we retransmit. */ unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) { u64 timo_j; u8 backoff = READ_ONCE(peer->backoff); timo_j = peer->rto_j; timo_j <<= backoff; if (retrans && timo_j * 2 <= RXRPC_RTO_MAX) WRITE_ONCE(peer->backoff, backoff + 1); if (timo_j < 1) timo_j = 1; return timo_j; } void rxrpc_peer_init_rtt(struct rxrpc_peer *peer) { peer->rto_j = RXRPC_TIMEOUT_INIT; peer->mdev_us = jiffies_to_usecs(RXRPC_TIMEOUT_INIT); peer->backoff = 0; //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U); } |
1009 1009 1009 1009 1009 1009 204 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 | // SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- * sysctl_net.c: sysctl interface to net subsystem. * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net directories for each protocol family. [MS] * * Revision 1.2 1996/05/08 20:24:40 shaver * Added bits for NET_BRIDGE and the NET_IPV4_ARP stuff and * NET_IPV4_IP_FORWARD. * * */ #include <linux/mm.h> #include <linux/export.h> #include <linux/sysctl.h> #include <linux/nsproxy.h> #include <net/sock.h> #ifdef CONFIG_INET #include <net/ip.h> #endif #ifdef CONFIG_NET #include <linux/if_ether.h> #endif static struct ctl_table_set * net_ctl_header_lookup(struct ctl_table_root *root) { return ¤t->nsproxy->net_ns->sysctls; } static int is_seen(struct ctl_table_set *set) { return ¤t->nsproxy->net_ns->sysctls == set; } /* Return standard mode bits for table entry. */ static int net_ctl_permissions(struct ctl_table_header *head, struct ctl_table *table) { struct net *net = container_of(head->set, struct net, sysctls); /* Allow network administrator to have same access as root. */ if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN)) { int mode = (table->mode >> 6) & 7; return (mode << 6) | (mode << 3) | mode; } return table->mode; } static void net_ctl_set_ownership(struct ctl_table_header *head, struct ctl_table *table, kuid_t *uid, kgid_t *gid) { struct net *net = container_of(head->set, struct net, sysctls); kuid_t ns_root_uid; kgid_t ns_root_gid; ns_root_uid = make_kuid(net->user_ns, 0); if (uid_valid(ns_root_uid)) *uid = ns_root_uid; ns_root_gid = make_kgid(net->user_ns, 0); if (gid_valid(ns_root_gid)) *gid = ns_root_gid; } static struct ctl_table_root net_sysctl_root = { .lookup = net_ctl_header_lookup, .permissions = net_ctl_permissions, .set_ownership = net_ctl_set_ownership, }; static int __net_init sysctl_net_init(struct net *net) { setup_sysctl_set(&net->sysctls, &net_sysctl_root, is_seen); return 0; } static void __net_exit sysctl_net_exit(struct net *net) { retire_sysctl_set(&net->sysctls); } static struct pernet_operations sysctl_pernet_ops = { .init = sysctl_net_init, .exit = sysctl_net_exit, }; static struct ctl_table_header *net_header; __init int net_sysctl_init(void) { static struct ctl_table empty[1]; int ret = -ENOMEM; /* Avoid limitations in the sysctl implementation by * registering "/proc/sys/net" as an empty directory not in a * network namespace. */ net_header = register_sysctl_sz("net", empty, 0); if (!net_header) goto out; ret = register_pernet_subsys(&sysctl_pernet_ops); if (ret) goto out1; out: return ret; out1: unregister_sysctl_table(net_header); net_header = NULL; goto out; } /* Verify that sysctls for non-init netns are safe by either: * 1) being read-only, or * 2) having a data pointer which points outside of the global kernel/module * data segment, and rather into the heap where a per-net object was * allocated. */ static void ensure_safe_net_sysctl(struct net *net, const char *path, struct ctl_table *table, size_t table_size) { struct ctl_table *ent; pr_debug("Registering net sysctl (net %p): %s\n", net, path); ent = table; for (size_t i = 0; i < table_size && ent->procname; ent++, i++) { unsigned long addr; const char *where; pr_debug(" procname=%s mode=%o proc_handler=%ps data=%p\n", ent->procname, ent->mode, ent->proc_handler, ent->data); /* If it's not writable inside the netns, then it can't hurt. */ if ((ent->mode & 0222) == 0) { pr_debug(" Not writable by anyone\n"); continue; } /* Where does data point? */ addr = (unsigned long)ent->data; if (is_module_address(addr)) where = "module"; else if (is_kernel_core_data(addr)) where = "kernel"; else continue; /* If it is writable and points to kernel/module global * data, then it's probably a netns leak. */ WARN(1, "sysctl %s/%s: data points to %s global data: %ps\n", path, ent->procname, where, ent->data); /* Make it "safe" by dropping writable perms */ ent->mode &= ~0222; } } struct ctl_table_header *register_net_sysctl_sz(struct net *net, const char *path, struct ctl_table *table, size_t table_size) { int count; struct ctl_table *entry; if (!net_eq(net, &init_net)) ensure_safe_net_sysctl(net, path, table, table_size); entry = table; for (count = 0 ; count < table_size && entry->procname; entry++, count++) ; return __register_sysctl_table(&net->sysctls, path, table, count); } EXPORT_SYMBOL_GPL(register_net_sysctl_sz); void unregister_net_sysctl_table(struct ctl_table_header *header) { unregister_sysctl_table(header); } EXPORT_SYMBOL_GPL(unregister_net_sysctl_table); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 | /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* * include/uapi/linux/tipc.h: Header for TIPC socket interface * * Copyright (c) 2003-2006, 2015-2016 Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _LINUX_TIPC_H_ #define _LINUX_TIPC_H_ #include <linux/types.h> #include <linux/sockios.h> /* * TIPC addressing primitives */ struct tipc_socket_addr { __u32 ref; __u32 node; }; struct tipc_service_addr { __u32 type; __u32 instance; }; struct tipc_service_range { __u32 type; __u32 lower; __u32 upper; }; /* * Application-accessible service types */ #define TIPC_NODE_STATE 0 /* node state service type */ #define TIPC_TOP_SRV 1 /* topology server service type */ #define TIPC_LINK_STATE 2 /* link state service type */ #define TIPC_RESERVED_TYPES 64 /* lowest user-allowed service type */ /* * Publication scopes when binding service / service range */ enum tipc_scope { TIPC_CLUSTER_SCOPE = 2, /* 0 can also be used */ TIPC_NODE_SCOPE = 3 }; /* * Limiting values for messages */ #define TIPC_MAX_USER_MSG_SIZE 66000U /* * Message importance levels */ #define TIPC_LOW_IMPORTANCE 0 #define TIPC_MEDIUM_IMPORTANCE 1 #define TIPC_HIGH_IMPORTANCE 2 #define TIPC_CRITICAL_IMPORTANCE 3 /* * Msg rejection/connection shutdown reasons */ #define TIPC_OK 0 #define TIPC_ERR_NO_NAME 1 #define TIPC_ERR_NO_PORT 2 #define TIPC_ERR_NO_NODE 3 #define TIPC_ERR_OVERLOAD 4 #define TIPC_CONN_SHUTDOWN 5 /* * TIPC topology subscription service definitions */ #define TIPC_SUB_PORTS 0x01 /* filter: evt at each match */ #define TIPC_SUB_SERVICE 0x02 /* filter: evt at first up/last down */ #define TIPC_SUB_CANCEL 0x04 /* filter: cancel a subscription */ #define TIPC_WAIT_FOREVER (~0) /* timeout for permanent subscription */ struct tipc_subscr { struct tipc_service_range seq; /* range of interest */ __u32 timeout; /* subscription duration (in ms) */ __u32 filter; /* bitmask of filter options */ char usr_handle[8]; /* available for subscriber use */ }; #define TIPC_PUBLISHED 1 /* publication event */ #define TIPC_WITHDRAWN 2 /* withdrawal event */ #define TIPC_SUBSCR_TIMEOUT 3 /* subscription timeout event */ struct tipc_event { __u32 event; /* event type */ __u32 found_lower; /* matching range */ __u32 found_upper; /* " " */ struct tipc_socket_addr port; /* associated socket */ struct tipc_subscr s; /* associated subscription */ }; /* * Socket API */ #ifndef AF_TIPC #define AF_TIPC 30 #endif #ifndef PF_TIPC #define PF_TIPC AF_TIPC #endif #ifndef SOL_TIPC #define SOL_TIPC 271 #endif #define TIPC_ADDR_MCAST 1 #define TIPC_SERVICE_RANGE 1 #define TIPC_SERVICE_ADDR 2 #define TIPC_SOCKET_ADDR 3 struct sockaddr_tipc { unsigned short family; unsigned char addrtype; signed char scope; union { struct tipc_socket_addr id; struct tipc_service_range nameseq; struct { struct tipc_service_addr name; __u32 domain; } name; } addr; }; /* * Ancillary data objects supported by recvmsg() */ #define TIPC_ERRINFO 1 /* error info */ #define TIPC_RETDATA 2 /* returned data */ #define TIPC_DESTNAME 3 /* destination name */ /* * TIPC-specific socket option names */ #define TIPC_IMPORTANCE 127 /* Default: TIPC_LOW_IMPORTANCE */ #define TIPC_SRC_DROPPABLE 128 /* Default: based on socket type */ #define TIPC_DEST_DROPPABLE 129 /* Default: based on socket type */ #define TIPC_CONN_TIMEOUT 130 /* Default: 8000 (ms) */ #define TIPC_NODE_RECVQ_DEPTH 131 /* Default: none (read only) */ #define TIPC_SOCK_RECVQ_DEPTH 132 /* Default: none (read only) */ #define TIPC_MCAST_BROADCAST 133 /* Default: TIPC selects. No arg */ #define TIPC_MCAST_REPLICAST 134 /* Default: TIPC selects. No arg */ #define TIPC_GROUP_JOIN 135 /* Takes struct tipc_group_req* */ #define TIPC_GROUP_LEAVE 136 /* No argument */ #define TIPC_SOCK_RECVQ_USED 137 /* Default: none (read only) */ #define TIPC_NODELAY 138 /* Default: false */ /* * Flag values */ #define TIPC_GROUP_LOOPBACK 0x1 /* Receive copy of sent msg when match */ #define TIPC_GROUP_MEMBER_EVTS 0x2 /* Receive membership events in socket */ struct tipc_group_req { __u32 type; /* group id */ __u32 instance; /* member id */ __u32 scope; /* cluster/node */ __u32 flags; }; /* * Maximum sizes of TIPC bearer-related names (including terminating NULL) * The string formatting for each name element is: * media: media * interface: media:interface name * link: node:interface-node:interface */ #define TIPC_NODEID_LEN 16 #define TIPC_MAX_MEDIA_NAME 16 #define TIPC_MAX_IF_NAME 16 #define TIPC_MAX_BEARER_NAME 32 #define TIPC_MAX_LINK_NAME 68 #define SIOCGETLINKNAME SIOCPROTOPRIVATE #define SIOCGETNODEID (SIOCPROTOPRIVATE + 1) struct tipc_sioc_ln_req { __u32 peer; __u32 bearer_id; char linkname[TIPC_MAX_LINK_NAME]; }; struct tipc_sioc_nodeid_req { __u32 peer; char node_id[TIPC_NODEID_LEN]; }; /* * TIPC Crypto, AEAD */ #define TIPC_AEAD_ALG_NAME (32) struct tipc_aead_key { char alg_name[TIPC_AEAD_ALG_NAME]; unsigned int keylen; /* in bytes */ char key[]; }; #define TIPC_AEAD_KEYLEN_MIN (16 + 4) #define TIPC_AEAD_KEYLEN_MAX (32 + 4) #define TIPC_AEAD_KEY_SIZE_MAX (sizeof(struct tipc_aead_key) + \ TIPC_AEAD_KEYLEN_MAX) static inline int tipc_aead_key_size(struct tipc_aead_key *key) { return sizeof(*key) + key->keylen; } #define TIPC_REKEYING_NOW (~0U) /* The macros and functions below are deprecated: */ #define TIPC_CFG_SRV 0 #define TIPC_ZONE_SCOPE 1 #define TIPC_ADDR_NAMESEQ 1 #define TIPC_ADDR_NAME 2 #define TIPC_ADDR_ID 3 #define TIPC_NODE_BITS 12 #define TIPC_CLUSTER_BITS 12 #define TIPC_ZONE_BITS 8 #define TIPC_NODE_OFFSET 0 #define TIPC_CLUSTER_OFFSET TIPC_NODE_BITS #define TIPC_ZONE_OFFSET (TIPC_CLUSTER_OFFSET + TIPC_CLUSTER_BITS) #define TIPC_NODE_SIZE ((1UL << TIPC_NODE_BITS) - 1) #define TIPC_CLUSTER_SIZE ((1UL << TIPC_CLUSTER_BITS) - 1) #define TIPC_ZONE_SIZE ((1UL << TIPC_ZONE_BITS) - 1) #define TIPC_NODE_MASK (TIPC_NODE_SIZE << TIPC_NODE_OFFSET) #define TIPC_CLUSTER_MASK (TIPC_CLUSTER_SIZE << TIPC_CLUSTER_OFFSET) #define TIPC_ZONE_MASK (TIPC_ZONE_SIZE << TIPC_ZONE_OFFSET) #define TIPC_ZONE_CLUSTER_MASK (TIPC_ZONE_MASK | TIPC_CLUSTER_MASK) #define tipc_portid tipc_socket_addr #define tipc_name tipc_service_addr #define tipc_name_seq tipc_service_range static inline __u32 tipc_addr(unsigned int zone, unsigned int cluster, unsigned int node) { return (zone << TIPC_ZONE_OFFSET) | (cluster << TIPC_CLUSTER_OFFSET) | node; } static inline unsigned int tipc_zone(__u32 addr) { return addr >> TIPC_ZONE_OFFSET; } static inline unsigned int tipc_cluster(__u32 addr) { return (addr & TIPC_CLUSTER_MASK) >> TIPC_CLUSTER_OFFSET; } static inline unsigned int tipc_node(__u32 addr) { return addr & TIPC_NODE_MASK; } #endif |
2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 | // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" /* LINKMODES_GET */ struct linkmodes_req_info { struct ethnl_req_info base; }; struct linkmodes_reply_data { struct ethnl_reply_data base; struct ethtool_link_ksettings ksettings; struct ethtool_link_settings *lsettings; bool peer_empty; }; #define LINKMODES_REPDATA(__reply_base) \ container_of(__reply_base, struct linkmodes_reply_data, base) const struct nla_policy ethnl_linkmodes_get_policy[] = { [ETHTOOL_A_LINKMODES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; data->lsettings = &data->ksettings.base; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = __ethtool_get_link_ksettings(dev, &data->ksettings); if (ret < 0 && info) { GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); goto out; } if (!dev->ethtool_ops->cap_link_lanes_supported) data->ksettings.lanes = 0; data->peer_empty = bitmap_empty(data->ksettings.link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); out: ethnl_ops_complete(dev); return ret; } static int linkmodes_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); const struct ethtool_link_ksettings *ksettings = &data->ksettings; const struct ethtool_link_settings *lsettings = &ksettings->base; bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int len, ret; len = nla_total_size(sizeof(u8)) /* LINKMODES_AUTONEG */ + nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */ + nla_total_size(sizeof(u32)) /* LINKMODES_LANES */ + nla_total_size(sizeof(u8)) /* LINKMODES_DUPLEX */ + nla_total_size(sizeof(u8)) /* LINKMODES_RATE_MATCHING */ + 0; ret = ethnl_bitset_size(ksettings->link_modes.advertising, ksettings->link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names, compact); if (ret < 0) return ret; len += ret; if (!data->peer_empty) { ret = ethnl_bitset_size(ksettings->link_modes.lp_advertising, NULL, __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names, compact); if (ret < 0) return ret; len += ret; } if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED) len += nla_total_size(sizeof(u8)); if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED) len += nla_total_size(sizeof(u8)); return len; } static int linkmodes_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); const struct ethtool_link_ksettings *ksettings = &data->ksettings; const struct ethtool_link_settings *lsettings = &ksettings->base; bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int ret; if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_AUTONEG, lsettings->autoneg)) return -EMSGSIZE; ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_OURS, ksettings->link_modes.advertising, ksettings->link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names, compact); if (ret < 0) return -EMSGSIZE; if (!data->peer_empty) { ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_PEER, ksettings->link_modes.lp_advertising, NULL, __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names, compact); if (ret < 0) return -EMSGSIZE; } if (nla_put_u32(skb, ETHTOOL_A_LINKMODES_SPEED, lsettings->speed) || nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, lsettings->duplex)) return -EMSGSIZE; if (ksettings->lanes && nla_put_u32(skb, ETHTOOL_A_LINKMODES_LANES, ksettings->lanes)) return -EMSGSIZE; if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED && nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, lsettings->master_slave_cfg)) return -EMSGSIZE; if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED && nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, lsettings->master_slave_state)) return -EMSGSIZE; if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_RATE_MATCHING, lsettings->rate_matching)) return -EMSGSIZE; return 0; } /* LINKMODES_SET */ const struct nla_policy ethnl_linkmodes_set_policy[] = { [ETHTOOL_A_LINKMODES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_NESTED }, [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 }, [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_U8 }, [ETHTOOL_A_LINKMODES_LANES] = NLA_POLICY_RANGE(NLA_U32, 1, 8), }; /* Set advertised link modes to all supported modes matching requested speed, * lanes and duplex values. Called when autonegotiation is on, speed, lanes or * duplex is requested but no link mode change. This is done in userspace with * ioctl() interface, move it into kernel for netlink. * Returns true if advertised modes bitmap was modified. */ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, bool req_speed, bool req_lanes, bool req_duplex) { unsigned long *advertising = ksettings->link_modes.advertising; unsigned long *supported = ksettings->link_modes.supported; DECLARE_BITMAP(old_adv, __ETHTOOL_LINK_MODE_MASK_NBITS); unsigned int i; bitmap_copy(old_adv, advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); for (i = 0; i < __ETHTOOL_LINK_MODE_MASK_NBITS; i++) { const struct link_mode_info *info = &link_mode_params[i]; if (info->speed == SPEED_UNKNOWN) continue; if (test_bit(i, supported) && (!req_speed || info->speed == ksettings->base.speed) && (!req_lanes || info->lanes == ksettings->lanes) && (!req_duplex || info->duplex == ksettings->base.duplex)) set_bit(i, advertising); else clear_bit(i, advertising); } return !bitmap_equal(old_adv, advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); } static bool ethnl_validate_master_slave_cfg(u8 cfg) { switch (cfg) { case MASTER_SLAVE_CFG_MASTER_PREFERRED: case MASTER_SLAVE_CFG_SLAVE_PREFERRED: case MASTER_SLAVE_CFG_MASTER_FORCE: case MASTER_SLAVE_CFG_SLAVE_FORCE: return true; } return false; } static int ethnl_check_linkmodes(struct genl_info *info, struct nlattr **tb) { const struct nlattr *master_slave_cfg, *lanes_cfg; master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]; if (master_slave_cfg && !ethnl_validate_master_slave_cfg(nla_get_u8(master_slave_cfg))) { NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, "master/slave value is invalid"); return -EOPNOTSUPP; } lanes_cfg = tb[ETHTOOL_A_LINKMODES_LANES]; if (lanes_cfg && !is_power_of_2(nla_get_u32(lanes_cfg))) { NL_SET_ERR_MSG_ATTR(info->extack, lanes_cfg, "lanes value is invalid"); return -EINVAL; } return 0; } static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, struct ethtool_link_ksettings *ksettings, bool *mod, const struct net_device *dev) { struct ethtool_link_settings *lsettings = &ksettings->base; bool req_speed, req_lanes, req_duplex; const struct nlattr *master_slave_cfg, *lanes_cfg; int ret; master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]; if (master_slave_cfg) { if (lsettings->master_slave_cfg == MASTER_SLAVE_CFG_UNSUPPORTED) { NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, "master/slave configuration not supported by device"); return -EOPNOTSUPP; } } *mod = false; req_speed = tb[ETHTOOL_A_LINKMODES_SPEED]; req_lanes = tb[ETHTOOL_A_LINKMODES_LANES]; req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX]; ethnl_update_u8(&lsettings->autoneg, tb[ETHTOOL_A_LINKMODES_AUTONEG], mod); lanes_cfg = tb[ETHTOOL_A_LINKMODES_LANES]; if (lanes_cfg) { /* If autoneg is off and lanes parameter is not supported by the * driver, return an error. */ if (!lsettings->autoneg && !dev->ethtool_ops->cap_link_lanes_supported) { NL_SET_ERR_MSG_ATTR(info->extack, lanes_cfg, "lanes configuration not supported by device"); return -EOPNOTSUPP; } } else if (!lsettings->autoneg && ksettings->lanes) { /* If autoneg is off and lanes parameter is not passed from user but * it was defined previously then set the lanes parameter to 0. */ ksettings->lanes = 0; *mod = true; } ret = ethnl_update_bitset(ksettings->link_modes.advertising, __ETHTOOL_LINK_MODE_MASK_NBITS, tb[ETHTOOL_A_LINKMODES_OURS], link_mode_names, info->extack, mod); if (ret < 0) return ret; ethnl_update_u32(&lsettings->speed, tb[ETHTOOL_A_LINKMODES_SPEED], mod); ethnl_update_u32(&ksettings->lanes, lanes_cfg, mod); ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX], mod); ethnl_update_u8(&lsettings->master_slave_cfg, master_slave_cfg, mod); if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg && (req_speed || req_lanes || req_duplex) && ethnl_auto_linkmodes(ksettings, req_speed, req_lanes, req_duplex)) *mod = true; return 0; } static int ethnl_set_linkmodes_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; int ret; ret = ethnl_check_linkmodes(info, info->attrs); if (ret < 0) return ret; if (!ops->get_link_ksettings || !ops->set_link_ksettings) return -EOPNOTSUPP; return 1; } static int ethnl_set_linkmodes(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_link_ksettings ksettings = {}; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; int ret; ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); return ret; } ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod, dev); if (ret < 0) return ret; if (!mod) return 0; ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "link settings update failed"); return ret; } return 1; } const struct ethnl_request_ops ethnl_linkmodes_request_ops = { .request_cmd = ETHTOOL_MSG_LINKMODES_GET, .reply_cmd = ETHTOOL_MSG_LINKMODES_GET_REPLY, .hdr_attr = ETHTOOL_A_LINKMODES_HEADER, .req_info_size = sizeof(struct linkmodes_req_info), .reply_data_size = sizeof(struct linkmodes_reply_data), .prepare_data = linkmodes_prepare_data, .reply_size = linkmodes_reply_size, .fill_reply = linkmodes_fill_reply, .set_validate = ethnl_set_linkmodes_validate, .set = ethnl_set_linkmodes, .set_ntf_cmd = ETHTOOL_MSG_LINKMODES_NTF, }; |
7 7 7 6 83 4 2 7 5 7 2 1 1 2 3 35 35 35 35 35 35 35 35 25 35 35 35 35 35 35 35 35 7 5 4 3 2 1 1 1 1 1 1 1 1 1 7 5 4 3 2 2 1 1 1 1 1 1 1 1 1 1 5 35 35 35 35 35 35 35 35 37 35 37 35 35 35 35 35 35 35 37 8 8 6 5 5 3 2 2 1 2 1 4 2 2 2 2 2 2 2 2 2 2 1 1 4 4 3 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_api.c Packet scheduler API. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired. * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/hrtimer.h> #include <linux/slab.h> #include <linux/hashtable.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <trace/events/qdisc.h> /* Short review. ------------- This file consists of two interrelated parts: 1. queueing disciplines manager frontend. 2. traffic classes manager frontend. Generally, queueing discipline ("qdisc") is a black box, which is able to enqueue packets and to dequeue them (when device is ready to send something) in order and at times determined by algorithm hidden in it. qdisc's are divided to two categories: - "queues", which have no internal structure visible from outside. - "schedulers", which split all the packets to "traffic classes", using "packet classifiers" (look at cls_api.c) In turn, classes may have child qdiscs (as rule, queues) attached to them etc. etc. etc. The goal of the routines in this file is to translate information supplied by user in the form of handles to more intelligible for kernel form, to make some sanity checks and part of work, which is common to all qdiscs and to provide rtnetlink notifications. All real intelligent work is done inside qdisc modules. Every discipline has two major routines: enqueue and dequeue. ---dequeue dequeue usually returns a skb to send. It is allowed to return NULL, but it does not mean that queue is empty, it just means that discipline does not want to send anything this time. Queue is really empty if q->q.qlen == 0. For complicated disciplines with multiple queues q->q is not real packet queue, but however q->q.qlen must be valid. ---enqueue enqueue returns 0, if packet was enqueued successfully. If packet (this one or another one) was dropped, it returns not zero error code. NET_XMIT_DROP - this packet dropped Expected action: do not backoff, but wait until queue will clear. NET_XMIT_CN - probably this packet enqueued, but another one dropped. Expected action: backoff or ignore Auxiliary routines: ---peek like dequeue but without removing a packet from the queue ---reset returns qdisc to initial state: purge all buffers, clear all timers, counters (except for statistics) etc. ---init initializes newly created qdisc. ---destroy destroys resources allocated by init and during lifetime of qdisc. ---change changes qdisc parameters. */ /* Protects list of registered TC modules. It is pure SMP lock. */ static DEFINE_RWLOCK(qdisc_mod_lock); /************************************************ * Queueing disciplines manipulation. * ************************************************/ /* The list of all installed queueing disciplines. */ static struct Qdisc_ops *qdisc_base; /* Register/unregister queueing discipline */ int register_qdisc(struct Qdisc_ops *qops) { struct Qdisc_ops *q, **qp; int rc = -EEXIST; write_lock(&qdisc_mod_lock); for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) if (!strcmp(qops->id, q->id)) goto out; if (qops->enqueue == NULL) qops->enqueue = noop_qdisc_ops.enqueue; if (qops->peek == NULL) { if (qops->dequeue == NULL) qops->peek = noop_qdisc_ops.peek; else goto out_einval; } if (qops->dequeue == NULL) qops->dequeue = noop_qdisc_ops.dequeue; if (qops->cl_ops) { const struct Qdisc_class_ops *cops = qops->cl_ops; if (!(cops->find && cops->walk && cops->leaf)) goto out_einval; if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf)) goto out_einval; } qops->next = NULL; *qp = qops; rc = 0; out: write_unlock(&qdisc_mod_lock); return rc; out_einval: rc = -EINVAL; goto out; } EXPORT_SYMBOL(register_qdisc); void unregister_qdisc(struct Qdisc_ops *qops) { struct Qdisc_ops *q, **qp; int err = -ENOENT; write_lock(&qdisc_mod_lock); for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) if (q == qops) break; if (q) { *qp = q->next; q->next = NULL; err = 0; } write_unlock(&qdisc_mod_lock); WARN(err, "unregister qdisc(%s) failed\n", qops->id); } EXPORT_SYMBOL(unregister_qdisc); /* Get default qdisc if not otherwise specified */ void qdisc_get_default(char *name, size_t len) { read_lock(&qdisc_mod_lock); strscpy(name, default_qdisc_ops->id, len); read_unlock(&qdisc_mod_lock); } static struct Qdisc_ops *qdisc_lookup_default(const char *name) { struct Qdisc_ops *q = NULL; for (q = qdisc_base; q; q = q->next) { if (!strcmp(name, q->id)) { if (!try_module_get(q->owner)) q = NULL; break; } } return q; } /* Set new default qdisc to use */ int qdisc_set_default(const char *name) { const struct Qdisc_ops *ops; if (!capable(CAP_NET_ADMIN)) return -EPERM; write_lock(&qdisc_mod_lock); ops = qdisc_lookup_default(name); if (!ops) { /* Not found, drop lock and try to load module */ write_unlock(&qdisc_mod_lock); request_module("sch_%s", name); write_lock(&qdisc_mod_lock); ops = qdisc_lookup_default(name); } if (ops) { /* Set new default */ module_put(default_qdisc_ops->owner); default_qdisc_ops = ops; } write_unlock(&qdisc_mod_lock); return ops ? 0 : -ENOENT; } #ifdef CONFIG_NET_SCH_DEFAULT /* Set default value from kernel config */ static int __init sch_default_qdisc(void) { return qdisc_set_default(CONFIG_DEFAULT_NET_SCH); } late_initcall(sch_default_qdisc); #endif /* We know handle. Find qdisc among all qdisc's attached to device * (root qdisc, all its children, children of children etc.) * Note: caller either uses rtnl or rcu_read_lock() */ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) { struct Qdisc *q; if (!qdisc_dev(root)) return (root->handle == handle ? root : NULL); if (!(root->flags & TCQ_F_BUILTIN) && root->handle == handle) return root; hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle, lockdep_rtnl_is_held()) { if (q->handle == handle) return q; } return NULL; } void qdisc_hash_add(struct Qdisc *q, bool invisible) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { ASSERT_RTNL(); hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle); if (invisible) q->flags |= TCQ_F_INVISIBLE; } } EXPORT_SYMBOL(qdisc_hash_add); void qdisc_hash_del(struct Qdisc *q) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { ASSERT_RTNL(); hash_del_rcu(&q->hash); } } EXPORT_SYMBOL(qdisc_hash_del); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) { struct Qdisc *q; if (!handle) return NULL; q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle); if (q) goto out; if (dev_ingress_queue(dev)) q = qdisc_match_from_root( rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping), handle); out: return q; } struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) { struct netdev_queue *nq; struct Qdisc *q; if (!handle) return NULL; q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle); if (q) goto out; nq = dev_ingress_queue_rcu(dev); if (nq) q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping), handle); out: return q; } static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) { unsigned long cl; const struct Qdisc_class_ops *cops = p->ops->cl_ops; if (cops == NULL) return NULL; cl = cops->find(p, classid); if (cl == 0) return NULL; return cops->leaf(p, cl); } /* Find queueing discipline by name */ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) { struct Qdisc_ops *q = NULL; if (kind) { read_lock(&qdisc_mod_lock); for (q = qdisc_base; q; q = q->next) { if (nla_strcmp(kind, q->id) == 0) { if (!try_module_get(q->owner)) q = NULL; break; } } read_unlock(&qdisc_mod_lock); } return q; } /* The linklayer setting were not transferred from iproute2, in older * versions, and the rate tables lookup systems have been dropped in * the kernel. To keep backward compatible with older iproute2 tc * utils, we detect the linklayer setting by detecting if the rate * table were modified. * * For linklayer ATM table entries, the rate table will be aligned to * 48 bytes, thus some table entries will contain the same value. The * mpu (min packet unit) is also encoded into the old rate table, thus * starting from the mpu, we find low and high table entries for * mapping this cell. If these entries contain the same value, when * the rate tables have been modified for linklayer ATM. * * This is done by rounding mpu to the nearest 48 bytes cell/entry, * and then roundup to the next cell, calc the table entry one below, * and compare. */ static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) { int low = roundup(r->mpu, 48); int high = roundup(low+1, 48); int cell_low = low >> r->cell_log; int cell_high = (high >> r->cell_log) - 1; /* rtab is too inaccurate at rates > 100Mbit/s */ if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { pr_debug("TC linklayer: Giving up ATM detection\n"); return TC_LINKLAYER_ETHERNET; } if ((cell_high > cell_low) && (cell_high < 256) && (rtab[cell_low] == rtab[cell_high])) { pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", cell_low, cell_high, rtab[cell_high]); return TC_LINKLAYER_ATM; } return TC_LINKLAYER_ETHERNET; } static struct qdisc_rate_table *qdisc_rtab_list; struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab, struct netlink_ext_ack *extack) { struct qdisc_rate_table *rtab; if (tab == NULL || r->rate == 0 || r->cell_log == 0 || r->cell_log >= 32 || nla_len(tab) != TC_RTAB_SIZE) { NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching"); return NULL; } for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && !memcmp(&rtab->data, nla_data(tab), 1024)) { rtab->refcnt++; return rtab; } } rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); if (rtab) { rtab->rate = *r; rtab->refcnt = 1; memcpy(rtab->data, nla_data(tab), 1024); if (r->linklayer == TC_LINKLAYER_UNAWARE) r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; qdisc_rtab_list = rtab; } else { NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table"); } return rtab; } EXPORT_SYMBOL(qdisc_get_rtab); void qdisc_put_rtab(struct qdisc_rate_table *tab) { struct qdisc_rate_table *rtab, **rtabp; if (!tab || --tab->refcnt) return; for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) { if (rtab == tab) { *rtabp = rtab->next; kfree(rtab); return; } } } EXPORT_SYMBOL(qdisc_put_rtab); static LIST_HEAD(qdisc_stab_list); static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = { [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) }, [TCA_STAB_DATA] = { .type = NLA_BINARY }, }; static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_STAB_MAX + 1]; struct qdisc_size_table *stab; struct tc_sizespec *s; unsigned int tsize = 0; u16 *tab = NULL; int err; err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy, extack); if (err < 0) return ERR_PTR(err); if (!tb[TCA_STAB_BASE]) { NL_SET_ERR_MSG(extack, "Size table base attribute is missing"); return ERR_PTR(-EINVAL); } s = nla_data(tb[TCA_STAB_BASE]); if (s->tsize > 0) { if (!tb[TCA_STAB_DATA]) { NL_SET_ERR_MSG(extack, "Size table data attribute is missing"); return ERR_PTR(-EINVAL); } tab = nla_data(tb[TCA_STAB_DATA]); tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); } if (tsize != s->tsize || (!tab && tsize > 0)) { NL_SET_ERR_MSG(extack, "Invalid size of size table"); return ERR_PTR(-EINVAL); } list_for_each_entry(stab, &qdisc_stab_list, list) { if (memcmp(&stab->szopts, s, sizeof(*s))) continue; if (tsize > 0 && memcmp(stab->data, tab, flex_array_size(stab, data, tsize))) continue; stab->refcnt++; return stab; } if (s->size_log > STAB_SIZE_LOG_MAX || s->cell_log > STAB_SIZE_LOG_MAX) { NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table"); return ERR_PTR(-EINVAL); } stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL); if (!stab) return ERR_PTR(-ENOMEM); stab->refcnt = 1; stab->szopts = *s; if (tsize > 0) memcpy(stab->data, tab, flex_array_size(stab, data, tsize)); list_add_tail(&stab->list, &qdisc_stab_list); return stab; } void qdisc_put_stab(struct qdisc_size_table *tab) { if (!tab) return; if (--tab->refcnt == 0) { list_del(&tab->list); kfree_rcu(tab, rcu); } } EXPORT_SYMBOL(qdisc_put_stab); static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, TCA_STAB); if (nest == NULL) goto nla_put_failure; if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts)) goto nla_put_failure; nla_nest_end(skb, nest); return skb->len; nla_put_failure: return -1; } void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab) { int pkt_len, slot; pkt_len = skb->len + stab->szopts.overhead; if (unlikely(!stab->szopts.tsize)) goto out; slot = pkt_len + stab->szopts.cell_align; if (unlikely(slot < 0)) slot = 0; slot >>= stab->szopts.cell_log; if (likely(slot < stab->szopts.tsize)) pkt_len = stab->data[slot]; else pkt_len = stab->data[stab->szopts.tsize - 1] * (slot / stab->szopts.tsize) + stab->data[slot % stab->szopts.tsize]; pkt_len <<= stab->szopts.size_log; out: if (unlikely(pkt_len < 1)) pkt_len = 1; qdisc_skb_cb(skb)->pkt_len = pkt_len; } EXPORT_SYMBOL(__qdisc_calculate_pkt_len); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) { if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", txt, qdisc->ops->id, qdisc->handle >> 16); qdisc->flags |= TCQ_F_WARN_NONWC; } } EXPORT_SYMBOL(qdisc_warn_nonwc); static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) { struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, timer); rcu_read_lock(); __netif_schedule(qdisc_root(wd->qdisc)); rcu_read_unlock(); return HRTIMER_NORESTART; } void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid) { hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); wd->timer.function = qdisc_watchdog; wd->qdisc = qdisc; } EXPORT_SYMBOL(qdisc_watchdog_init_clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) { qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC); } EXPORT_SYMBOL(qdisc_watchdog_init); void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, u64 delta_ns) { bool deactivated; rcu_read_lock(); deactivated = test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state); rcu_read_unlock(); if (deactivated) return; if (hrtimer_is_queued(&wd->timer)) { u64 softexpires; softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer)); /* If timer is already set in [expires, expires + delta_ns], * do not reprogram it. */ if (softexpires - expires <= delta_ns) return; } hrtimer_start_range_ns(&wd->timer, ns_to_ktime(expires), delta_ns, HRTIMER_MODE_ABS_PINNED); } EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns); void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) { hrtimer_cancel(&wd->timer); } EXPORT_SYMBOL(qdisc_watchdog_cancel); static struct hlist_head *qdisc_class_hash_alloc(unsigned int n) { struct hlist_head *h; unsigned int i; h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL); if (h != NULL) { for (i = 0; i < n; i++) INIT_HLIST_HEAD(&h[i]); } return h; } void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) { struct Qdisc_class_common *cl; struct hlist_node *next; struct hlist_head *nhash, *ohash; unsigned int nsize, nmask, osize; unsigned int i, h; /* Rehash when load factor exceeds 0.75 */ if (clhash->hashelems * 4 <= clhash->hashsize * 3) return; nsize = clhash->hashsize * 2; nmask = nsize - 1; nhash = qdisc_class_hash_alloc(nsize); if (nhash == NULL) return; ohash = clhash->hash; osize = clhash->hashsize; sch_tree_lock(sch); for (i = 0; i < osize; i++) { hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) { h = qdisc_class_hash(cl->classid, nmask); hlist_add_head(&cl->hnode, &nhash[h]); } } clhash->hash = nhash; clhash->hashsize = nsize; clhash->hashmask = nmask; sch_tree_unlock(sch); kvfree(ohash); } EXPORT_SYMBOL(qdisc_class_hash_grow); int qdisc_class_hash_init(struct Qdisc_class_hash *clhash) { unsigned int size = 4; clhash->hash = qdisc_class_hash_alloc(size); if (!clhash->hash) return -ENOMEM; clhash->hashsize = size; clhash->hashmask = size - 1; clhash->hashelems = 0; return 0; } EXPORT_SYMBOL(qdisc_class_hash_init); void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash) { kvfree(clhash->hash); } EXPORT_SYMBOL(qdisc_class_hash_destroy); void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash, struct Qdisc_class_common *cl) { unsigned int h; INIT_HLIST_NODE(&cl->hnode); h = qdisc_class_hash(cl->classid, clhash->hashmask); hlist_add_head(&cl->hnode, &clhash->hash[h]); clhash->hashelems++; } EXPORT_SYMBOL(qdisc_class_hash_insert); void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash, struct Qdisc_class_common *cl) { hlist_del(&cl->hnode); clhash->hashelems--; } EXPORT_SYMBOL(qdisc_class_hash_remove); /* Allocate an unique handle from space managed by kernel * Possible range is [8000-FFFF]:0000 (0x8000 values) */ static u32 qdisc_alloc_handle(struct net_device *dev) { int i = 0x8000; static u32 autohandle = TC_H_MAKE(0x80000000U, 0); do { autohandle += TC_H_MAKE(0x10000U, 0); if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) autohandle = TC_H_MAKE(0x80000000U, 0); if (!qdisc_lookup(dev, autohandle)) return autohandle; cond_resched(); } while (--i > 0); return 0; } void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) { bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; bool notify; int drops; if (n == 0 && len == 0) return; drops = max_t(int, n, 0); rcu_read_lock(); while ((parentid = sch->parent)) { if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) break; if (sch->flags & TCQ_F_NOPARENT) break; /* Notify parent qdisc only if child qdisc becomes empty. * * If child was empty even before update then backlog * counter is screwed and we skip notification because * parent class is already passive. * * If the original child was offloaded then it is allowed * to be seem as empty, so the parent is notified anyway. */ notify = !sch->q.qlen && !WARN_ON_ONCE(!n && !qdisc_is_offloaded); /* TODO: perform the search on a per txq basis */ sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { WARN_ON_ONCE(parentid != TC_H_ROOT); break; } cops = sch->ops->cl_ops; if (notify && cops->qlen_notify) { cl = cops->find(sch, parentid); cops->qlen_notify(sch, cl); } sch->q.qlen -= n; sch->qstats.backlog -= len; __qdisc_qstats_drop(sch, drops); } rcu_read_unlock(); } EXPORT_SYMBOL(qdisc_tree_reduce_backlog); int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type, void *type_data) { struct net_device *dev = qdisc_dev(sch); int err; sch->flags &= ~TCQ_F_OFFLOADED; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return 0; err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); if (err == -EOPNOTSUPP) return 0; if (!err) sch->flags |= TCQ_F_OFFLOADED; return err; } EXPORT_SYMBOL(qdisc_offload_dump_helper); void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, struct Qdisc *new, struct Qdisc *old, enum tc_setup_type type, void *type_data, struct netlink_ext_ack *extack) { bool any_qdisc_is_offloaded; int err; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); /* Don't report error if the graft is part of destroy operation. */ if (!err || !new || new == &noop_qdisc) return; /* Don't report error if the parent, the old child and the new * one are not offloaded. */ any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED; any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED; any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED; if (any_qdisc_is_offloaded) NL_SET_ERR_MSG(extack, "Offloading graft operation failed."); } EXPORT_SYMBOL(qdisc_offload_graft_helper); void qdisc_offload_query_caps(struct net_device *dev, enum tc_setup_type type, void *caps, size_t caps_len) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_query_caps_base base = { .type = type, .caps = caps, }; memset(caps, 0, caps_len); if (ops->ndo_setup_tc) ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base); } EXPORT_SYMBOL(qdisc_offload_query_caps); static void qdisc_offload_graft_root(struct net_device *dev, struct Qdisc *new, struct Qdisc *old, struct netlink_ext_ack *extack) { struct tc_root_qopt_offload graft_offload = { .command = TC_ROOT_GRAFT, .handle = new ? new->handle : 0, .ingress = (new && new->flags & TCQ_F_INGRESS) || (old && old->flags & TCQ_F_INGRESS), }; qdisc_offload_graft_helper(dev, NULL, new, old, TC_SETUP_ROOT_QDISC, &graft_offload, extack); } static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event, struct netlink_ext_ack *extack) { struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; struct gnet_stats_queue __percpu *cpu_qstats = NULL; struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; struct qdisc_size_table *stab; u32 block_index; __u32 qlen; cond_resched(); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) goto out_nlmsg_trim; tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; tcm->tcm_ifindex = qdisc_dev(q)->ifindex; tcm->tcm_parent = clid; tcm->tcm_handle = q->handle; tcm->tcm_info = refcount_read(&q->refcnt); if (nla_put_string(skb, TCA_KIND, q->ops->id)) goto nla_put_failure; if (q->ops->ingress_block_get) { block_index = q->ops->ingress_block_get(q); if (block_index && nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index)) goto nla_put_failure; } if (q->ops->egress_block_get) { block_index = q->ops->egress_block_get(q); if (block_index && nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index)) goto nla_put_failure; } if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED))) goto nla_put_failure; qlen = qdisc_qlen_sum(q); stab = rtnl_dereference(q->stab); if (stab && qdisc_dump_stab(skb, stab) < 0) goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, NULL, &d, TCA_PAD) < 0) goto nla_put_failure; if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) goto nla_put_failure; if (qdisc_is_percpu_stats(q)) { cpu_bstats = q->cpu_bstats; cpu_qstats = q->cpu_qstats; } if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 || gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; if (extack && extack->_msg && nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) goto out_nlmsg_trim; nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; out_nlmsg_trim: nla_put_failure: nlmsg_trim(skb, b); return -1; } static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible) { if (q->flags & TCQ_F_BUILTIN) return true; if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible) return true; return false; } static int qdisc_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, struct Qdisc *old, struct Qdisc *new, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (old && !tc_qdisc_dump_ignore(old, false)) { if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, 0, RTM_DELQDISC, extack) < 0) goto err_out; } if (new && !tc_qdisc_dump_ignore(new, false)) { if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0) goto err_out; } if (skb->len) return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); err_out: kfree_skb(skb); return -EINVAL; } static void notify_and_destroy(struct net *net, struct sk_buff *skb, struct nlmsghdr *n, u32 clid, struct Qdisc *old, struct Qdisc *new, struct netlink_ext_ack *extack) { if (new || old) qdisc_notify(net, skb, n, clid, old, new, extack); if (old) qdisc_put(old); } static void qdisc_clear_nolock(struct Qdisc *sch) { sch->flags &= ~TCQ_F_NOLOCK; if (!(sch->flags & TCQ_F_CPUSTATS)) return; free_percpu(sch->cpu_bstats); free_percpu(sch->cpu_qstats); sch->cpu_bstats = NULL; sch->cpu_qstats = NULL; sch->flags &= ~TCQ_F_CPUSTATS; } /* Graft qdisc "new" to class "classid" of qdisc "parent" or * to device "dev". * * When appropriate send a netlink notification using 'skb' * and "n". * * On success, destroy old qdisc. */ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, struct sk_buff *skb, struct nlmsghdr *n, u32 classid, struct Qdisc *new, struct Qdisc *old, struct netlink_ext_ack *extack) { struct Qdisc *q = old; struct net *net = dev_net(dev); if (parent == NULL) { unsigned int i, num_q, ingress; struct netdev_queue *dev_queue; ingress = 0; num_q = dev->num_tx_queues; if ((q && q->flags & TCQ_F_INGRESS) || (new && new->flags & TCQ_F_INGRESS)) { ingress = 1; dev_queue = dev_ingress_queue(dev); if (!dev_queue) { NL_SET_ERR_MSG(extack, "Device does not have an ingress queue"); return -ENOENT; } q = rtnl_dereference(dev_queue->qdisc_sleeping); /* This is the counterpart of that qdisc_refcount_inc_nz() call in * __tcf_qdisc_find() for filter requests. */ if (!qdisc_refcount_dec_if_one(q)) { NL_SET_ERR_MSG(extack, "Current ingress or clsact Qdisc has ongoing filter requests"); return -EBUSY; } } if (dev->flags & IFF_UP) dev_deactivate(dev); qdisc_offload_graft_root(dev, new, old, extack); if (new && new->ops->attach && !ingress) goto skip; if (!ingress) { for (i = 0; i < num_q; i++) { dev_queue = netdev_get_tx_queue(dev, i); old = dev_graft_qdisc(dev_queue, new); if (new && i > 0) qdisc_refcount_inc(new); qdisc_put(old); } } else { old = dev_graft_qdisc(dev_queue, NULL); /* {ingress,clsact}_destroy() @old before grafting @new to avoid * unprotected concurrent accesses to net_device::miniq_{in,e}gress * pointer(s) in mini_qdisc_pair_swap(). */ qdisc_notify(net, skb, n, classid, old, new, extack); qdisc_destroy(old); dev_graft_qdisc(dev_queue, new); } skip: if (!ingress) { old = rtnl_dereference(dev->qdisc); if (new && !new->ops->attach) qdisc_refcount_inc(new); rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); notify_and_destroy(net, skb, n, classid, old, new, extack); if (new && new->ops->attach) new->ops->attach(new); } if (dev->flags & IFF_UP) dev_activate(dev); } else { const struct Qdisc_class_ops *cops = parent->ops->cl_ops; unsigned long cl; int err; /* Only support running class lockless if parent is lockless */ if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK)) qdisc_clear_nolock(new); if (!cops || !cops->graft) return -EOPNOTSUPP; cl = cops->find(parent, classid); if (!cl) { NL_SET_ERR_MSG(extack, "Specified class not found"); return -ENOENT; } if (new && new->ops == &noqueue_qdisc_ops) { NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class"); return -EINVAL; } err = cops->graft(parent, cl, new, &old, extack); if (err) return err; notify_and_destroy(net, skb, n, classid, old, new, extack); } return 0; } static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca, struct netlink_ext_ack *extack) { u32 block_index; if (tca[TCA_INGRESS_BLOCK]) { block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]); if (!block_index) { NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0"); return -EINVAL; } if (!sch->ops->ingress_block_set) { NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported"); return -EOPNOTSUPP; } sch->ops->ingress_block_set(sch, block_index); } if (tca[TCA_EGRESS_BLOCK]) { block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]); if (!block_index) { NL_SET_ERR_MSG(extack, "Egress block index cannot be 0"); return -EINVAL; } if (!sch->ops->egress_block_set) { NL_SET_ERR_MSG(extack, "Egress block sharing is not supported"); return -EOPNOTSUPP; } sch->ops->egress_block_set(sch, block_index); } return 0; } /* Allocate and initialize new qdisc. Parameters are passed via opt. */ static struct Qdisc *qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, u32 parent, u32 handle, struct nlattr **tca, int *errp, struct netlink_ext_ack *extack) { int err; struct nlattr *kind = tca[TCA_KIND]; struct Qdisc *sch; struct Qdisc_ops *ops; struct qdisc_size_table *stab; ops = qdisc_lookup_ops(kind); #ifdef CONFIG_MODULES if (ops == NULL && kind != NULL) { char name[IFNAMSIZ]; if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { /* We dropped the RTNL semaphore in order to * perform the module load. So, even if we * succeeded in loading the module we have to * tell the caller to replay the request. We * indicate this using -EAGAIN. * We replay the request because the device may * go away in the mean time. */ rtnl_unlock(); request_module("sch_%s", name); rtnl_lock(); ops = qdisc_lookup_ops(kind); if (ops != NULL) { /* We will try again qdisc_lookup_ops, * so don't keep a reference. */ module_put(ops->owner); err = -EAGAIN; goto err_out; } } } #endif err = -ENOENT; if (!ops) { NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown"); goto err_out; } sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { err = PTR_ERR(sch); goto err_out2; } sch->parent = parent; if (handle == TC_H_INGRESS) { if (!(sch->flags & TCQ_F_INGRESS)) { NL_SET_ERR_MSG(extack, "Specified parent ID is reserved for ingress and clsact Qdiscs"); err = -EINVAL; goto err_out3; } handle = TC_H_MAKE(TC_H_INGRESS, 0); } else { if (handle == 0) { handle = qdisc_alloc_handle(dev); if (handle == 0) { NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded"); err = -ENOSPC; goto err_out3; } } if (!netif_is_multiqueue(dev)) sch->flags |= TCQ_F_ONETXQUEUE; } sch->handle = handle; /* This exist to keep backward compatible with a userspace * loophole, what allowed userspace to get IFF_NO_QUEUE * facility on older kernels by setting tx_queue_len=0 (prior * to qdisc init), and then forgot to reinit tx_queue_len * before again attaching a qdisc. */ if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) { dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; netdev_info(dev, "Caught tx_queue_len zero misconfig\n"); } err = qdisc_block_indexes_set(sch, tca, extack); if (err) goto err_out3; if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB], extack); if (IS_ERR(stab)) { err = PTR_ERR(stab); goto err_out3; } rcu_assign_pointer(sch->stab, stab); } if (ops->init) { err = ops->init(sch, tca[TCA_OPTIONS], extack); if (err != 0) goto err_out4; } if (tca[TCA_RATE]) { err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) { NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); goto err_out4; } err = gen_new_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); goto err_out4; } } qdisc_hash_add(sch, false); trace_qdisc_create(ops, dev, parent); return sch; err_out4: /* Even if ops->init() failed, we call ops->destroy() * like qdisc_create_dflt(). */ if (ops->destroy) ops->destroy(sch); qdisc_put_stab(rtnl_dereference(sch->stab)); err_out3: netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: module_put(ops->owner); err_out: *errp = err; return NULL; } static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, struct netlink_ext_ack *extack) { struct qdisc_size_table *ostab, *stab = NULL; int err = 0; if (tca[TCA_OPTIONS]) { if (!sch->ops->change) { NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc"); return -EINVAL; } if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) { NL_SET_ERR_MSG(extack, "Change of blocks is not supported"); return -EOPNOTSUPP; } err = sch->ops->change(sch, tca[TCA_OPTIONS], extack); if (err) return err; } if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB], extack); if (IS_ERR(stab)) return PTR_ERR(stab); } ostab = rtnl_dereference(sch->stab); rcu_assign_pointer(sch->stab, stab); qdisc_put_stab(ostab); if (tca[TCA_RATE]) { /* NB: ignores errors from replace_estimator because change can't be undone. */ if (sch->flags & TCQ_F_MQROOT) goto out; gen_replace_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, NULL, true, tca[TCA_RATE]); } out: return 0; } struct check_loop_arg { struct qdisc_walker w; struct Qdisc *p; int depth; }; static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) { struct check_loop_arg arg; if (q->ops->cl_ops == NULL) return 0; arg.w.stop = arg.w.skip = arg.w.count = 0; arg.w.fn = check_loop_fn; arg.depth = depth; arg.p = p; q->ops->cl_ops->walk(q, &arg.w); return arg.w.stop ? -ELOOP : 0; } static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) { struct Qdisc *leaf; const struct Qdisc_class_ops *cops = q->ops->cl_ops; struct check_loop_arg *arg = (struct check_loop_arg *)w; leaf = cops->leaf(q, cl); if (leaf) { if (leaf == arg->p || arg->depth > 7) return -ELOOP; return check_loop(leaf, arg->p, arg->depth + 1); } return 0; } const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { [TCA_KIND] = { .type = NLA_STRING }, [TCA_RATE] = { .type = NLA_BINARY, .len = sizeof(struct tc_estimator) }, [TCA_STAB] = { .type = NLA_NESTED }, [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG }, [TCA_CHAIN] = { .type = NLA_U32 }, [TCA_INGRESS_BLOCK] = { .type = NLA_U32 }, [TCA_EGRESS_BLOCK] = { .type = NLA_U32 }, }; /* * Delete/get qdisc. */ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct tcmsg *tcm = nlmsg_data(n); struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; u32 clid; struct Qdisc *q = NULL; struct Qdisc *p = NULL; int err; err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return -ENODEV; clid = tcm->tcm_parent; if (clid) { if (clid != TC_H_ROOT) { if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { p = qdisc_lookup(dev, TC_H_MAJ(clid)); if (!p) { NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid"); return -ENOENT; } q = qdisc_leaf(p, clid); } else if (dev_ingress_queue(dev)) { q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } } else { q = rtnl_dereference(dev->qdisc); } if (!q) { NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); return -ENOENT; } if (tcm->tcm_handle && q->handle != tcm->tcm_handle) { NL_SET_ERR_MSG(extack, "Invalid handle"); return -EINVAL; } } else { q = qdisc_lookup(dev, tcm->tcm_handle); if (!q) { NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle"); return -ENOENT; } } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { NL_SET_ERR_MSG(extack, "Invalid qdisc name"); return -EINVAL; } if (n->nlmsg_type == RTM_DELQDISC) { if (!clid) { NL_SET_ERR_MSG(extack, "Classid cannot be zero"); return -EINVAL; } if (q->handle == 0) { NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero"); return -ENOENT; } err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack); if (err != 0) return err; } else { qdisc_notify(net, skb, n, clid, NULL, q, NULL); } return 0; } static bool req_create_or_replace(struct nlmsghdr *n) { return (n->nlmsg_flags & NLM_F_CREATE && n->nlmsg_flags & NLM_F_REPLACE); } static bool req_create_exclusive(struct nlmsghdr *n) { return (n->nlmsg_flags & NLM_F_CREATE && n->nlmsg_flags & NLM_F_EXCL); } static bool req_change(struct nlmsghdr *n) { return (!(n->nlmsg_flags & NLM_F_CREATE) && !(n->nlmsg_flags & NLM_F_REPLACE) && !(n->nlmsg_flags & NLM_F_EXCL)); } /* * Create/change qdisc. */ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct tcmsg *tcm; struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; u32 clid; struct Qdisc *q, *p; int err; replay: /* Reinit, just in case something touches this. */ err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; tcm = nlmsg_data(n); clid = tcm->tcm_parent; q = p = NULL; dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return -ENODEV; if (clid) { if (clid != TC_H_ROOT) { if (clid != TC_H_INGRESS) { p = qdisc_lookup(dev, TC_H_MAJ(clid)); if (!p) { NL_SET_ERR_MSG(extack, "Failed to find specified qdisc"); return -ENOENT; } q = qdisc_leaf(p, clid); } else if (dev_ingress_queue_create(dev)) { q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } } else { q = rtnl_dereference(dev->qdisc); } /* It may be default qdisc, ignore it */ if (q && q->handle == 0) q = NULL; if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { if (tcm->tcm_handle) { if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) { NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override"); return -EEXIST; } if (TC_H_MIN(tcm->tcm_handle)) { NL_SET_ERR_MSG(extack, "Invalid minor handle"); return -EINVAL; } q = qdisc_lookup(dev, tcm->tcm_handle); if (!q) goto create_n_graft; if (n->nlmsg_flags & NLM_F_EXCL) { NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override"); return -EEXIST; } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { NL_SET_ERR_MSG(extack, "Invalid qdisc name"); return -EINVAL; } if (q->flags & TCQ_F_INGRESS) { NL_SET_ERR_MSG(extack, "Cannot regraft ingress or clsact Qdiscs"); return -EINVAL; } if (q == p || (p && check_loop(q, p, 0))) { NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected"); return -ELOOP; } if (clid == TC_H_INGRESS) { NL_SET_ERR_MSG(extack, "Ingress cannot graft directly"); return -EINVAL; } qdisc_refcount_inc(q); goto graft; } else { if (!q) goto create_n_graft; /* This magic test requires explanation. * * We know, that some child q is already * attached to this parent and have choice: * 1) change it or 2) create/graft new one. * If the requested qdisc kind is different * than the existing one, then we choose graft. * If they are the same then this is "change" * operation - just let it fallthrough.. * * 1. We are allowed to create/graft only * if the request is explicitly stating * "please create if it doesn't exist". * * 2. If the request is to exclusive create * then the qdisc tcm_handle is not expected * to exist, so that we choose create/graft too. * * 3. The last case is when no flags are set. * This will happen when for example tc * utility issues a "change" command. * Alas, it is sort of hole in API, we * cannot decide what to do unambiguously. * For now we select create/graft. */ if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { if (req_create_or_replace(n) || req_create_exclusive(n)) goto create_n_graft; else if (req_change(n)) goto create_n_graft2; } } } } else { if (!tcm->tcm_handle) { NL_SET_ERR_MSG(extack, "Handle cannot be zero"); return -EINVAL; } q = qdisc_lookup(dev, tcm->tcm_handle); } /* Change qdisc parameters */ if (!q) { NL_SET_ERR_MSG(extack, "Specified qdisc not found"); return -ENOENT; } if (n->nlmsg_flags & NLM_F_EXCL) { NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify"); return -EEXIST; } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { NL_SET_ERR_MSG(extack, "Invalid qdisc name"); return -EINVAL; } err = qdisc_change(q, tca, extack); if (err == 0) qdisc_notify(net, skb, n, clid, NULL, q, extack); return err; create_n_graft: if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag"); return -ENOENT; } create_n_graft2: if (clid == TC_H_INGRESS) { if (dev_ingress_queue(dev)) { q = qdisc_create(dev, dev_ingress_queue(dev), tcm->tcm_parent, tcm->tcm_parent, tca, &err, extack); } else { NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device"); err = -ENOENT; } } else { struct netdev_queue *dev_queue; if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue) dev_queue = p->ops->cl_ops->select_queue(p, tcm); else if (p) dev_queue = p->dev_queue; else dev_queue = netdev_get_tx_queue(dev, 0); q = qdisc_create(dev, dev_queue, tcm->tcm_parent, tcm->tcm_handle, tca, &err, extack); } if (q == NULL) { if (err == -EAGAIN) goto replay; return err; } graft: err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); if (err) { if (q) qdisc_put(q); return err; } return 0; } static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, struct netlink_callback *cb, int *q_idx_p, int s_q_idx, bool recur, bool dump_invisible) { int ret = 0, q_idx = *q_idx_p; struct Qdisc *q; int b; if (!root) return 0; q = root; if (q_idx < s_q_idx) { q_idx++; } else { if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } /* If dumping singletons, there is no qdisc_dev(root) and the singleton * itself has already been dumped. * * If we've already dumped the top-level (ingress) qdisc above and the global * qdisc hashtable, we don't want to hit it again */ if (!qdisc_dev(root) || !recur) goto out; hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { if (q_idx < s_q_idx) { q_idx++; continue; } if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } out: *q_idx_p = q_idx; return ret; done: ret = -1; goto out; } static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int idx, q_idx; int s_idx, s_q_idx; struct net_device *dev; const struct nlmsghdr *nlh = cb->nlh; struct nlattr *tca[TCA_MAX + 1]; int err; s_idx = cb->args[0]; s_q_idx = q_idx = cb->args[1]; idx = 0; ASSERT_RTNL(); err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX, rtm_tca_policy, cb->extack); if (err < 0) return err; for_each_netdev(net, dev) { struct netdev_queue *dev_queue; if (idx < s_idx) goto cont; if (idx > s_idx) s_q_idx = 0; q_idx = 0; if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc), skb, cb, &q_idx, s_q_idx, true, tca[TCA_DUMP_INVISIBLE]) < 0) goto done; dev_queue = dev_ingress_queue(dev); if (dev_queue && tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping), skb, cb, &q_idx, s_q_idx, false, tca[TCA_DUMP_INVISIBLE]) < 0) goto done; cont: idx++; } done: cb->args[0] = idx; cb->args[1] = q_idx; return skb->len; } /************************************************ * Traffic classes manipulation. * ************************************************/ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, unsigned long cl, u32 portid, u32 seq, u16 flags, int event, struct netlink_ext_ack *extack) { struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; cond_resched(); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) goto out_nlmsg_trim; tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; tcm->tcm_ifindex = qdisc_dev(q)->ifindex; tcm->tcm_parent = q->handle; tcm->tcm_handle = q->handle; tcm->tcm_info = 0; if (nla_put_string(skb, TCA_KIND, q->ops->id)) goto nla_put_failure; if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, NULL, &d, TCA_PAD) < 0) goto nla_put_failure; if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; if (extack && extack->_msg && nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) goto out_nlmsg_trim; nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; out_nlmsg_trim: nla_put_failure: nlmsg_trim(skb, b); return -1; } static int tclass_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct Qdisc *q, unsigned long cl, int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) { kfree_skb(skb); return -EINVAL; } return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } static int tclass_del_notify(struct net *net, const struct Qdisc_class_ops *cops, struct sk_buff *oskb, struct nlmsghdr *n, struct Qdisc *q, unsigned long cl, struct netlink_ext_ack *extack) { u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; struct sk_buff *skb; int err = 0; if (!cops->delete) return -EOPNOTSUPP; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_DELTCLASS, extack) < 0) { kfree_skb(skb); return -EINVAL; } err = cops->delete(q, cl, extack); if (err) { kfree_skb(skb); return err; } err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); return err; } #ifdef CONFIG_NET_CLS struct tcf_bind_args { struct tcf_walker w; unsigned long base; unsigned long cl; u32 classid; }; static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) { struct tcf_bind_args *a = (void *)arg; if (n && tp->ops->bind_class) { struct Qdisc *q = tcf_block_q(tp->chain->block); sch_tree_lock(q); tp->ops->bind_class(n, a->classid, a->cl, q, a->base); sch_tree_unlock(q); } return 0; } struct tc_bind_class_args { struct qdisc_walker w; unsigned long new_cl; u32 portid; u32 clid; }; static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) { struct tc_bind_class_args *a = (struct tc_bind_class_args *)w; const struct Qdisc_class_ops *cops = q->ops->cl_ops; struct tcf_block *block; struct tcf_chain *chain; block = cops->tcf_block(q, cl, NULL); if (!block) return 0; for (chain = tcf_get_next_chain(block, NULL); chain; chain = tcf_get_next_chain(block, chain)) { struct tcf_proto *tp; for (tp = tcf_get_next_proto(chain, NULL); tp; tp = tcf_get_next_proto(chain, tp)) { struct tcf_bind_args arg = {}; arg.w.fn = tcf_node_bind; arg.classid = a->clid; arg.base = cl; arg.cl = a->new_cl; tp->ops->walk(tp, &arg.w, true); } } return 0; } static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, unsigned long new_cl) { const struct Qdisc_class_ops *cops = q->ops->cl_ops; struct tc_bind_class_args args = {}; if (!cops->tcf_block) return; args.portid = portid; args.clid = clid; args.new_cl = new_cl; args.w.fn = tc_bind_class_walker; q->ops->cl_ops->walk(q, &args.w); } #else static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, unsigned long new_cl) { } #endif static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct tcmsg *tcm = nlmsg_data(n); struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; struct Qdisc *q = NULL; const struct Qdisc_class_ops *cops; unsigned long cl = 0; unsigned long new_cl; u32 portid; u32 clid; u32 qid; int err; err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return -ENODEV; /* parent == TC_H_UNSPEC - unspecified parent. parent == TC_H_ROOT - class is root, which has no parent. parent == X:0 - parent is root class. parent == X:Y - parent is a node in hierarchy. parent == 0:Y - parent is X:Y, where X:0 is qdisc. handle == 0:0 - generate handle from kernel pool. handle == 0:Y - class is X:Y, where X:0 is qdisc. handle == X:Y - clear. handle == X:0 - root class. */ /* Step 1. Determine qdisc handle X:0 */ portid = tcm->tcm_parent; clid = tcm->tcm_handle; qid = TC_H_MAJ(clid); if (portid != TC_H_ROOT) { u32 qid1 = TC_H_MAJ(portid); if (qid && qid1) { /* If both majors are known, they must be identical. */ if (qid != qid1) return -EINVAL; } else if (qid1) { qid = qid1; } else if (qid == 0) qid = rtnl_dereference(dev->qdisc)->handle; /* Now qid is genuine qdisc handle consistent * both with parent and child. * * TC_H_MAJ(portid) still may be unspecified, complete it now. */ if (portid) portid = TC_H_MAKE(qid, portid); } else { if (qid == 0) qid = rtnl_dereference(dev->qdisc)->handle; } /* OK. Locate qdisc */ q = qdisc_lookup(dev, qid); if (!q) return -ENOENT; /* An check that it supports classes */ cops = q->ops->cl_ops; if (cops == NULL) return -EINVAL; /* Now try to get class */ if (clid == 0) { if (portid == TC_H_ROOT) clid = qid; } else clid = TC_H_MAKE(qid, clid); if (clid) cl = cops->find(q, clid); if (cl == 0) { err = -ENOENT; if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags & NLM_F_CREATE)) goto out; } else { switch (n->nlmsg_type) { case RTM_NEWTCLASS: err = -EEXIST; if (n->nlmsg_flags & NLM_F_EXCL) goto out; break; case RTM_DELTCLASS: err = tclass_del_notify(net, cops, skb, n, q, cl, extack); /* Unbind the class with flilters with 0 */ tc_bind_tclass(q, portid, clid, 0); goto out; case RTM_GETTCLASS: err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack); goto out; default: err = -EINVAL; goto out; } } if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) { NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes"); return -EOPNOTSUPP; } new_cl = cl; err = -EOPNOTSUPP; if (cops->change) err = cops->change(q, clid, portid, tca, &new_cl, extack); if (err == 0) { tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack); /* We just create a new class, need to do reverse binding. */ if (cl != new_cl) tc_bind_tclass(q, portid, clid, new_cl); } out: return err; } struct qdisc_dump_args { struct qdisc_walker w; struct sk_buff *skb; struct netlink_callback *cb; }; static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) { struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS, NULL); } static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, struct tcmsg *tcm, struct netlink_callback *cb, int *t_p, int s_t) { struct qdisc_dump_args arg; if (tc_qdisc_dump_ignore(q, false) || *t_p < s_t || !q->ops->cl_ops || (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle)) { (*t_p)++; return 0; } if (*t_p > s_t) memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); arg.w.fn = qdisc_class_dump; arg.skb = skb; arg.cb = cb; arg.w.stop = 0; arg.w.skip = cb->args[1]; arg.w.count = 0; q->ops->cl_ops->walk(q, &arg.w); cb->args[1] = arg.w.count; if (arg.w.stop) return -1; (*t_p)++; return 0; } static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, struct tcmsg *tcm, struct netlink_callback *cb, int *t_p, int s_t, bool recur) { struct Qdisc *q; int b; if (!root) return 0; if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0) return -1; if (!qdisc_dev(root) || !recur) return 0; if (tcm->tcm_parent) { q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent)); if (q && q != root && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) return -1; return 0; } hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) return -1; } return 0; } static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) { struct tcmsg *tcm = nlmsg_data(cb->nlh); struct net *net = sock_net(skb->sk); struct netdev_queue *dev_queue; struct net_device *dev; int t, s_t; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return 0; dev = dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return 0; s_t = cb->args[0]; t = 0; if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc), skb, tcm, cb, &t, s_t, true) < 0) goto done; dev_queue = dev_ingress_queue(dev); if (dev_queue && tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping), skb, tcm, cb, &t, s_t, false) < 0) goto done; done: cb->args[0] = t; dev_put(dev); return skb->len; } #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { seq_printf(seq, "%08x %08x %08x %08x\n", (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1), 1000000, (u32)NSEC_PER_SEC / hrtimer_resolution); return 0; } static int __net_init psched_net_init(struct net *net) { struct proc_dir_entry *e; e = proc_create_single("psched", 0, net->proc_net, psched_show); if (e == NULL) return -ENOMEM; return 0; } static void __net_exit psched_net_exit(struct net *net) { remove_proc_entry("psched", net->proc_net); } #else static int __net_init psched_net_init(struct net *net) { return 0; } static void __net_exit psched_net_exit(struct net *net) { } #endif static struct pernet_operations psched_net_ops = { .init = psched_net_init, .exit = psched_net_exit, }; #if IS_ENABLED(CONFIG_RETPOLINE) DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper); #endif static int __init pktsched_init(void) { int err; err = register_pernet_subsys(&psched_net_ops); if (err) { pr_err("pktsched_init: " "cannot initialize per netns operations\n"); return err; } register_qdisc(&pfifo_fast_ops); register_qdisc(&pfifo_qdisc_ops); register_qdisc(&bfifo_qdisc_ops); register_qdisc(&pfifo_head_drop_qdisc_ops); register_qdisc(&mq_qdisc_ops); register_qdisc(&noqueue_qdisc_ops); rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, 0); rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, 0); tc_wrapper_init(); return 0; } subsys_initcall(pktsched_init); |
1122 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright 2019 Google LLC */ #ifndef __LINUX_BLK_CRYPTO_H #define __LINUX_BLK_CRYPTO_H #include <linux/types.h> enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_INVALID, BLK_ENCRYPTION_MODE_AES_256_XTS, BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, BLK_ENCRYPTION_MODE_ADIANTUM, BLK_ENCRYPTION_MODE_SM4_XTS, BLK_ENCRYPTION_MODE_MAX, }; #define BLK_CRYPTO_MAX_KEY_SIZE 64 /** * struct blk_crypto_config - an inline encryption key's crypto configuration * @crypto_mode: encryption algorithm this key is for * @data_unit_size: the data unit size for all encryption/decryptions with this * key. This is the size in bytes of each individual plaintext and * ciphertext. This is always a power of 2. It might be e.g. the * filesystem block size or the disk sector size. * @dun_bytes: the maximum number of bytes of DUN used when using this key */ struct blk_crypto_config { enum blk_crypto_mode_num crypto_mode; unsigned int data_unit_size; unsigned int dun_bytes; }; /** * struct blk_crypto_key - an inline encryption key * @crypto_cfg: the crypto configuration (like crypto_mode, key size) for this * key * @data_unit_size_bits: log2 of data_unit_size * @size: size of this key in bytes (determined by @crypto_cfg.crypto_mode) * @raw: the raw bytes of this key. Only the first @size bytes are used. * * A blk_crypto_key is immutable once created, and many bios can reference it at * the same time. It must not be freed until all bios using it have completed * and it has been evicted from all devices on which it may have been used. */ struct blk_crypto_key { struct blk_crypto_config crypto_cfg; unsigned int data_unit_size_bits; unsigned int size; u8 raw[BLK_CRYPTO_MAX_KEY_SIZE]; }; #define BLK_CRYPTO_MAX_IV_SIZE 32 #define BLK_CRYPTO_DUN_ARRAY_SIZE (BLK_CRYPTO_MAX_IV_SIZE / sizeof(u64)) /** * struct bio_crypt_ctx - an inline encryption context * @bc_key: the key, algorithm, and data unit size to use * @bc_dun: the data unit number (starting IV) to use * * A bio_crypt_ctx specifies that the contents of the bio will be encrypted (for * write requests) or decrypted (for read requests) inline by the storage device * or controller, or by the crypto API fallback. */ struct bio_crypt_ctx { const struct blk_crypto_key *bc_key; u64 bc_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; }; #include <linux/blk_types.h> #include <linux/blkdev.h> #ifdef CONFIG_BLK_INLINE_ENCRYPTION static inline bool bio_has_crypt_ctx(struct bio *bio) { return bio->bi_crypt_context; } void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask); bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, unsigned int bytes, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]); int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size); int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key); void blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); bool blk_crypto_config_supported_natively(struct block_device *bdev, const struct blk_crypto_config *cfg); bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool bio_has_crypt_ctx(struct bio *bio) { return false; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); /** * bio_crypt_clone - clone bio encryption context * @dst: destination bio * @src: source bio * @gfp_mask: memory allocation flags * * If @src has an encryption context, clone it to @dst. * * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. */ static inline int bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) { if (bio_has_crypt_ctx(src)) return __bio_crypt_clone(dst, src, gfp_mask); return 0; } #endif /* __LINUX_BLK_CRYPTO_H */ |
8622 8650 8533 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_WORD_AT_A_TIME_H #define _ASM_WORD_AT_A_TIME_H #include <linux/kernel.h> /* * This is largely generic for little-endian machines, but the * optimal byte mask counting is probably going to be something * that is architecture-specific. If you have a reliably fast * bit count instruction, that might be better than the multiply * and shift, for example. */ struct word_at_a_time { const unsigned long one_bits, high_bits; }; #define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) } #ifdef CONFIG_64BIT /* * Jan Achrenius on G+: microoptimized version of * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" * that works for the bytemasks without having to * mask them first. */ static inline long count_masked_bytes(unsigned long mask) { return mask*0x0001020304050608ul >> 56; } #else /* 32-bit case */ /* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ static inline long count_masked_bytes(long mask) { /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ long a = (0x0ff0001+mask) >> 23; /* Fix the 1 for 00 case */ return a & mask; } #endif /* Return nonzero if it has a zero */ static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c) { unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits; *bits = mask; return mask; } static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c) { return bits; } static inline unsigned long create_zero_mask(unsigned long bits) { bits = (bits - 1) & ~bits; return bits >> 7; } /* The mask we created is directly usable as a bytemask */ #define zero_bytemask(mask) (mask) static inline unsigned long find_zero(unsigned long mask) { return count_masked_bytes(mask); } /* * Load an unaligned word from kernel space. * * In the (very unlikely) case of the word being a page-crosser * and the next page not being mapped, take the exception and * return zeroes in the non-existing part. */ static inline unsigned long load_unaligned_zeropad(const void *addr) { unsigned long ret; asm volatile( "1: mov %[mem], %[ret]\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_ZEROPAD) : [ret] "=r" (ret) : [mem] "m" (*(unsigned long *)addr)); return ret; } #endif /* _ASM_WORD_AT_A_TIME_H */ |
1915 1919 4273 1356 1552 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/pagevec.h * * In many places it is efficient to batch an operation up against multiple * folios. A folio_batch is a container which is used for that. */ #ifndef _LINUX_PAGEVEC_H #define _LINUX_PAGEVEC_H #include <linux/types.h> /* 15 pointers + header align the folio_batch structure to a power of two */ #define PAGEVEC_SIZE 15 struct folio; /** * struct folio_batch - A collection of folios. * * The folio_batch is used to amortise the cost of retrieving and * operating on a set of folios. The order of folios in the batch may be * significant (eg delete_from_page_cache_batch()). Some users of the * folio_batch store "exceptional" entries in it which can be removed * by calling folio_batch_remove_exceptionals(). */ struct folio_batch { unsigned char nr; bool percpu_pvec_drained; struct folio *folios[PAGEVEC_SIZE]; }; /** * folio_batch_init() - Initialise a batch of folios * @fbatch: The folio batch. * * A freshly initialised folio_batch contains zero folios. */ static inline void folio_batch_init(struct folio_batch *fbatch) { fbatch->nr = 0; fbatch->percpu_pvec_drained = false; } static inline void folio_batch_reinit(struct folio_batch *fbatch) { fbatch->nr = 0; } static inline unsigned int folio_batch_count(struct folio_batch *fbatch) { return fbatch->nr; } static inline unsigned int folio_batch_space(struct folio_batch *fbatch) { return PAGEVEC_SIZE - fbatch->nr; } /** * folio_batch_add() - Add a folio to a batch. * @fbatch: The folio batch. * @folio: The folio to add. * * The folio is added to the end of the batch. * The batch must have previously been initialised using folio_batch_init(). * * Return: The number of slots still available. */ static inline unsigned folio_batch_add(struct folio_batch *fbatch, struct folio *folio) { fbatch->folios[fbatch->nr++] = folio; return folio_batch_space(fbatch); } void __folio_batch_release(struct folio_batch *pvec); static inline void folio_batch_release(struct folio_batch *fbatch) { if (folio_batch_count(fbatch)) __folio_batch_release(fbatch); } void folio_batch_remove_exceptionals(struct folio_batch *fbatch); #endif /* _LINUX_PAGEVEC_H */ |
114 195 196 196 214 233 196 57 955 798 170 3 586 53 170 1884 1079 8 993 840 1138 452 276 819 14 213 692 583 582 117 299 416 299 4 413 5 21 272 4 2 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _NET_IPV6_H #define _NET_IPV6_H #include <linux/ipv6.h> #include <linux/hardirq.h> #include <linux/jhash.h> #include <linux/refcount.h> #include <linux/jump_label_ratelimit.h> #include <net/if_inet6.h> #include <net/flow.h> #include <net/flow_dissector.h> #include <net/inet_dscp.h> #include <net/snmp.h> #include <net/netns/hash.h> struct ip_tunnel_info; #define SIN6_LEN_RFC2133 24 #define IPV6_MAXPLEN 65535 /* * NextHeader field of IPv6 header */ #define NEXTHDR_HOP 0 /* Hop-by-hop option header. */ #define NEXTHDR_IPV4 4 /* IPv4 in IPv6 */ #define NEXTHDR_TCP 6 /* TCP segment. */ #define NEXTHDR_UDP 17 /* UDP message. */ #define NEXTHDR_IPV6 41 /* IPv6 in IPv6 */ #define NEXTHDR_ROUTING 43 /* Routing header. */ #define NEXTHDR_FRAGMENT 44 /* Fragmentation/reassembly header. */ #define NEXTHDR_GRE 47 /* GRE header. */ #define NEXTHDR_ESP 50 /* Encapsulating security payload. */ #define NEXTHDR_AUTH 51 /* Authentication header. */ #define NEXTHDR_ICMP 58 /* ICMP for IPv6. */ #define NEXTHDR_NONE 59 /* No next header */ #define NEXTHDR_DEST 60 /* Destination options header. */ #define NEXTHDR_SCTP 132 /* SCTP message. */ #define NEXTHDR_MOBILITY 135 /* Mobility header. */ #define NEXTHDR_MAX 255 #define IPV6_DEFAULT_HOPLIMIT 64 #define IPV6_DEFAULT_MCASTHOPS 1 /* Limits on Hop-by-Hop and Destination options. * * Per RFC8200 there is no limit on the maximum number or lengths of options in * Hop-by-Hop or Destination options other then the packet must fit in an MTU. * We allow configurable limits in order to mitigate potential denial of * service attacks. * * There are three limits that may be set: * - Limit the number of options in a Hop-by-Hop or Destination options * extension header * - Limit the byte length of a Hop-by-Hop or Destination options extension * header * - Disallow unknown options * * The limits are expressed in corresponding sysctls: * * ipv6.sysctl.max_dst_opts_cnt * ipv6.sysctl.max_hbh_opts_cnt * ipv6.sysctl.max_dst_opts_len * ipv6.sysctl.max_hbh_opts_len * * max_*_opts_cnt is the number of TLVs that are allowed for Destination * options or Hop-by-Hop options. If the number is less than zero then unknown * TLVs are disallowed and the number of known options that are allowed is the * absolute value. Setting the value to INT_MAX indicates no limit. * * max_*_opts_len is the length limit in bytes of a Destination or * Hop-by-Hop options extension header. Setting the value to INT_MAX * indicates no length limit. * * If a limit is exceeded when processing an extension header the packet is * silently discarded. */ /* Default limits for Hop-by-Hop and Destination options */ #define IP6_DEFAULT_MAX_DST_OPTS_CNT 8 #define IP6_DEFAULT_MAX_HBH_OPTS_CNT 8 #define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ #define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ /* * Addr type * * type - unicast | multicast * scope - local | site | global * v4 - compat * v4mapped * any * loopback */ #define IPV6_ADDR_ANY 0x0000U #define IPV6_ADDR_UNICAST 0x0001U #define IPV6_ADDR_MULTICAST 0x0002U #define IPV6_ADDR_LOOPBACK 0x0010U #define IPV6_ADDR_LINKLOCAL 0x0020U #define IPV6_ADDR_SITELOCAL 0x0040U #define IPV6_ADDR_COMPATv4 0x0080U #define IPV6_ADDR_SCOPE_MASK 0x00f0U #define IPV6_ADDR_MAPPED 0x1000U /* * Addr scopes */ #define IPV6_ADDR_MC_SCOPE(a) \ ((a)->s6_addr[1] & 0x0f) /* nonstandard */ #define __IPV6_ADDR_SCOPE_INVALID -1 #define IPV6_ADDR_SCOPE_NODELOCAL 0x01 #define IPV6_ADDR_SCOPE_LINKLOCAL 0x02 #define IPV6_ADDR_SCOPE_SITELOCAL 0x05 #define IPV6_ADDR_SCOPE_ORGLOCAL 0x08 #define IPV6_ADDR_SCOPE_GLOBAL 0x0e /* * Addr flags */ #define IPV6_ADDR_MC_FLAG_TRANSIENT(a) \ ((a)->s6_addr[1] & 0x10) #define IPV6_ADDR_MC_FLAG_PREFIX(a) \ ((a)->s6_addr[1] & 0x20) #define IPV6_ADDR_MC_FLAG_RENDEZVOUS(a) \ ((a)->s6_addr[1] & 0x40) /* * fragmentation header */ struct frag_hdr { __u8 nexthdr; __u8 reserved; __be16 frag_off; __be32 identification; }; /* * Jumbo payload option, as described in RFC 2675 2. */ struct hop_jumbo_hdr { u8 nexthdr; u8 hdrlen; u8 tlv_type; /* IPV6_TLV_JUMBO, 0xC2 */ u8 tlv_len; /* 4 */ __be32 jumbo_payload_len; }; #define IP6_MF 0x0001 #define IP6_OFFSET 0xFFF8 struct ip6_fraglist_iter { struct ipv6hdr *tmp_hdr; struct sk_buff *frag; int offset; unsigned int hlen; __be32 frag_id; u8 nexthdr; }; int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_fraglist_iter *iter); void ip6_fraglist_prepare(struct sk_buff *skb, struct ip6_fraglist_iter *iter); static inline struct sk_buff *ip6_fraglist_next(struct ip6_fraglist_iter *iter) { struct sk_buff *skb = iter->frag; iter->frag = skb->next; skb_mark_not_on_list(skb); return skb; } struct ip6_frag_state { u8 *prevhdr; unsigned int hlen; unsigned int mtu; unsigned int left; int offset; int ptr; int hroom; int troom; __be32 frag_id; u8 nexthdr; }; void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state); struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state); #define IP6_REPLY_MARK(net, mark) \ ((net)->ipv6.sysctl.fwmark_reflect ? (mark) : 0) #include <net/sock.h> /* sysctls */ extern int sysctl_mld_max_msf; extern int sysctl_mld_qrv; #define _DEVINC(net, statname, mod, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_INC_STATS64((_idev)->stats.statname, (field));\ mod##SNMP_INC_STATS64((net)->mib.statname##_statistics, (field));\ }) /* per device counters are atomic_long_t */ #define _DEVINCATOMIC(net, statname, mod, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \ mod##SNMP_INC_STATS((net)->mib.statname##_statistics, (field));\ }) /* per device and per net counters are atomic_long_t */ #define _DEVINC_ATOMIC_ATOMIC(net, statname, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \ SNMP_INC_STATS_ATOMIC_LONG((net)->mib.statname##_statistics, (field));\ }) #define _DEVADD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_ADD_STATS((_idev)->stats.statname, (field), (val)); \ mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, (field), (val));\ }) #define _DEVUPD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, (val)); \ mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\ }) /* MIBs */ #define IP6_INC_STATS(net, idev,field) \ _DEVINC(net, ipv6, , idev, field) #define __IP6_INC_STATS(net, idev,field) \ _DEVINC(net, ipv6, __, idev, field) #define IP6_ADD_STATS(net, idev,field,val) \ _DEVADD(net, ipv6, , idev, field, val) #define __IP6_ADD_STATS(net, idev,field,val) \ _DEVADD(net, ipv6, __, idev, field, val) #define IP6_UPD_PO_STATS(net, idev,field,val) \ _DEVUPD(net, ipv6, , idev, field, val) #define __IP6_UPD_PO_STATS(net, idev,field,val) \ _DEVUPD(net, ipv6, __, idev, field, val) #define ICMP6_INC_STATS(net, idev, field) \ _DEVINCATOMIC(net, icmpv6, , idev, field) #define __ICMP6_INC_STATS(net, idev, field) \ _DEVINCATOMIC(net, icmpv6, __, idev, field) #define ICMP6MSGOUT_INC_STATS(net, idev, field) \ _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field +256) #define ICMP6MSGIN_INC_STATS(net, idev, field) \ _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field) struct ip6_ra_chain { struct ip6_ra_chain *next; struct sock *sk; int sel; void (*destructor)(struct sock *); }; extern struct ip6_ra_chain *ip6_ra_chain; extern rwlock_t ip6_ra_lock; /* This structure is prepared by protocol, when parsing ancillary data and passed to IPv6. */ struct ipv6_txoptions { refcount_t refcnt; /* Length of this structure */ int tot_len; /* length of extension headers */ __u16 opt_flen; /* after fragment hdr */ __u16 opt_nflen; /* before fragment hdr */ struct ipv6_opt_hdr *hopopt; struct ipv6_opt_hdr *dst0opt; struct ipv6_rt_hdr *srcrt; /* Routing Header */ struct ipv6_opt_hdr *dst1opt; struct rcu_head rcu; /* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */ }; /* flowlabel_reflect sysctl values */ enum flowlabel_reflect { FLOWLABEL_REFLECT_ESTABLISHED = 1, FLOWLABEL_REFLECT_TCP_RESET = 2, FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES = 4, }; struct ip6_flowlabel { struct ip6_flowlabel __rcu *next; __be32 label; atomic_t users; struct in6_addr dst; struct ipv6_txoptions *opt; unsigned long linger; struct rcu_head rcu; u8 share; union { struct pid *pid; kuid_t uid; } owner; unsigned long lastuse; unsigned long expires; struct net *fl_net; }; #define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF) #define IPV6_FLOWLABEL_MASK cpu_to_be32(0x000FFFFF) #define IPV6_FLOWLABEL_STATELESS_FLAG cpu_to_be32(0x00080000) #define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) #define IPV6_TCLASS_SHIFT 20 struct ipv6_fl_socklist { struct ipv6_fl_socklist __rcu *next; struct ip6_flowlabel *fl; struct rcu_head rcu; }; struct ipcm6_cookie { struct sockcm_cookie sockc; __s16 hlimit; __s16 tclass; __u16 gso_size; __s8 dontfrag; struct ipv6_txoptions *opt; }; static inline void ipcm6_init(struct ipcm6_cookie *ipc6) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, .tclass = -1, .dontfrag = -1, }; } static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, const struct ipv6_pinfo *np) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, .tclass = np->tclass, .dontfrag = np->dontfrag, }; } static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) { struct ipv6_txoptions *opt; rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { if (!refcount_inc_not_zero(&opt->refcnt)) opt = NULL; else opt = rcu_pointer_handoff(opt); } rcu_read_unlock(); return opt; } static inline void txopt_put(struct ipv6_txoptions *opt) { if (opt && refcount_dec_and_test(&opt->refcnt)) kfree_rcu(opt, rcu); } #if IS_ENABLED(CONFIG_IPV6) struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label); extern struct static_key_false_deferred ipv6_flowlabel_exclusive; static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) { if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key) && READ_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl)) return __fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT); return NULL; } #endif struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, struct ipv6_txoptions *fopt); void fl6_free_socklist(struct sock *sk); int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen); int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { if (fl) atomic_dec(&fl->users); } enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len); int ip6_ra_control(struct sock *sk, int sel); int ipv6_parse_hopopts(struct sk_buff *skb); struct ipv6_txoptions *ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt); struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt); struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); static inline struct ipv6_txoptions * ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) { if (!opt) return NULL; return __ipv6_fixup_options(opt_space, opt); } bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt); struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt); /* This helper is specialized for BIG TCP needs. * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header. * It assumes headers are already in skb->head. * Returns 0, or IPPROTO_TCP if a BIG TCP packet is there. */ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) { const struct hop_jumbo_hdr *jhdr; const struct ipv6hdr *nhdr; if (likely(skb->len <= GRO_LEGACY_MAX_SIZE)) return 0; if (skb->protocol != htons(ETH_P_IPV6)) return 0; if (skb_network_offset(skb) + sizeof(struct ipv6hdr) + sizeof(struct hop_jumbo_hdr) > skb_headlen(skb)) return 0; nhdr = ipv6_hdr(skb); if (nhdr->nexthdr != NEXTHDR_HOP) return 0; jhdr = (const struct hop_jumbo_hdr *) (nhdr + 1); if (jhdr->tlv_type != IPV6_TLV_JUMBO || jhdr->hdrlen != 0 || jhdr->nexthdr != IPPROTO_TCP) return 0; return jhdr->nexthdr; } /* Return 0 if HBH header is successfully removed * Or if HBH removal is unnecessary (packet is not big TCP) * Return error to indicate dropping the packet */ static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb) { const int hophdr_len = sizeof(struct hop_jumbo_hdr); int nexthdr = ipv6_has_hopopt_jumbo(skb); struct ipv6hdr *h6; if (!nexthdr) return 0; if (skb_cow_head(skb, 0)) return -1; /* Remove the HBH header. * Layout: [Ethernet header][IPv6 header][HBH][L4 Header] */ memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb), skb_network_header(skb) - skb_mac_header(skb) + sizeof(struct ipv6hdr)); __skb_pull(skb, hophdr_len); skb->network_header += hophdr_len; skb->mac_header += hophdr_len; h6 = ipv6_hdr(skb); h6->nexthdr = nexthdr; return 0; } static inline bool ipv6_accept_ra(struct inet6_dev *idev) { /* If forwarding is enabled, RA are not accepted unless the special * hybrid mode (accept_ra=2) is enabled. */ return idev->cnf.forwarding ? idev->cnf.accept_ra == 2 : idev->cnf.accept_ra; } #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ #define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */ #define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */ int __ipv6_addr_type(const struct in6_addr *addr); static inline int ipv6_addr_type(const struct in6_addr *addr) { return __ipv6_addr_type(addr) & 0xffff; } static inline int ipv6_addr_scope(const struct in6_addr *addr) { return __ipv6_addr_type(addr) & IPV6_ADDR_SCOPE_MASK; } static inline int __ipv6_addr_src_scope(int type) { return (type == IPV6_ADDR_ANY) ? __IPV6_ADDR_SCOPE_INVALID : (type >> 16); } static inline int ipv6_addr_src_scope(const struct in6_addr *addr) { return __ipv6_addr_src_scope(__ipv6_addr_type(addr)); } static inline bool __ipv6_addr_needs_scope_id(int type) { return type & IPV6_ADDR_LINKLOCAL || (type & IPV6_ADDR_MULTICAST && (type & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL))); } static inline __u32 ipv6_iface_scope_id(const struct in6_addr *addr, int iface) { return __ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)) ? iface : 0; } static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2) { return memcmp(a1, a2, sizeof(struct in6_addr)); } static inline bool ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m, const struct in6_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ulm = (const unsigned long *)m; const unsigned long *ul2 = (const unsigned long *)a2; return !!(((ul1[0] ^ ul2[0]) & ulm[0]) | ((ul1[1] ^ ul2[1]) & ulm[1])); #else return !!(((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) | ((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) | ((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) | ((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3])); #endif } static inline void ipv6_addr_prefix(struct in6_addr *pfx, const struct in6_addr *addr, int plen) { /* caller must guarantee 0 <= plen <= 128 */ int o = plen >> 3, b = plen & 0x7; memset(pfx->s6_addr, 0, sizeof(pfx->s6_addr)); memcpy(pfx->s6_addr, addr, o); if (b != 0) pfx->s6_addr[o] = addr->s6_addr[o] & (0xff00 >> b); } static inline void ipv6_addr_prefix_copy(struct in6_addr *addr, const struct in6_addr *pfx, int plen) { /* caller must guarantee 0 <= plen <= 128 */ int o = plen >> 3, b = plen & 0x7; memcpy(addr->s6_addr, pfx, o); if (b != 0) { addr->s6_addr[o] &= ~(0xff00 >> b); addr->s6_addr[o] |= (pfx->s6_addr[o] & (0xff00 >> b)); } } static inline void __ipv6_addr_set_half(__be32 *addr, __be32 wh, __be32 wl) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #if defined(__BIG_ENDIAN) if (__builtin_constant_p(wh) && __builtin_constant_p(wl)) { *(__force u64 *)addr = ((__force u64)(wh) << 32 | (__force u64)(wl)); return; } #elif defined(__LITTLE_ENDIAN) if (__builtin_constant_p(wl) && __builtin_constant_p(wh)) { *(__force u64 *)addr = ((__force u64)(wl) << 32 | (__force u64)(wh)); return; } #endif #endif addr[0] = wh; addr[1] = wl; } static inline void ipv6_addr_set(struct in6_addr *addr, __be32 w1, __be32 w2, __be32 w3, __be32 w4) { __ipv6_addr_set_half(&addr->s6_addr32[0], w1, w2); __ipv6_addr_set_half(&addr->s6_addr32[2], w3, w4); } static inline bool ipv6_addr_equal(const struct in6_addr *a1, const struct in6_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ul2 = (const unsigned long *)a2; return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL; #else return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) | (a1->s6_addr32[1] ^ a2->s6_addr32[1]) | (a1->s6_addr32[2] ^ a2->s6_addr32[2]) | (a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0; #endif } #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 static inline bool __ipv6_prefix_equal64_half(const __be64 *a1, const __be64 *a2, unsigned int len) { if (len && ((*a1 ^ *a2) & cpu_to_be64((~0UL) << (64 - len)))) return false; return true; } static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, const struct in6_addr *addr2, unsigned int prefixlen) { const __be64 *a1 = (const __be64 *)addr1; const __be64 *a2 = (const __be64 *)addr2; if (prefixlen >= 64) { if (a1[0] ^ a2[0]) return false; return __ipv6_prefix_equal64_half(a1 + 1, a2 + 1, prefixlen - 64); } return __ipv6_prefix_equal64_half(a1, a2, prefixlen); } #else static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, const struct in6_addr *addr2, unsigned int prefixlen) { const __be32 *a1 = addr1->s6_addr32; const __be32 *a2 = addr2->s6_addr32; unsigned int pdw, pbi; /* check complete u32 in prefix */ pdw = prefixlen >> 5; if (pdw && memcmp(a1, a2, pdw << 2)) return false; /* check incomplete u32 in prefix */ pbi = prefixlen & 0x1f; if (pbi && ((a1[pdw] ^ a2[pdw]) & htonl((0xffffffff) << (32 - pbi)))) return false; return true; } #endif static inline bool ipv6_addr_any(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul = (const unsigned long *)a; return (ul[0] | ul[1]) == 0UL; #else return (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]) == 0; #endif } static inline u32 ipv6_addr_hash(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul = (const unsigned long *)a; unsigned long x = ul[0] ^ ul[1]; return (u32)(x ^ (x >> 32)); #else return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^ a->s6_addr32[2] ^ a->s6_addr32[3]); #endif } /* more secured version of ipv6_addr_hash() */ static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval) { return jhash2((__force const u32 *)a->s6_addr32, ARRAY_SIZE(a->s6_addr32), initval); } static inline bool ipv6_addr_loopback(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const __be64 *be = (const __be64 *)a; return (be[0] | (be[1] ^ cpu_to_be64(1))) == 0UL; #else return (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | (a->s6_addr32[3] ^ cpu_to_be32(1))) == 0; #endif } /* * Note that we must __force cast these to unsigned long to make sparse happy, * since all of the endian-annotated types are fixed size regardless of arch. */ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) { return ( #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 *(unsigned long *)a | #else (__force unsigned long)(a->s6_addr32[0] | a->s6_addr32[1]) | #endif (__force unsigned long)(a->s6_addr32[2] ^ cpu_to_be32(0x0000ffff))) == 0UL; } static inline bool ipv6_addr_v4mapped_any(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && ipv4_is_zeronet(a->s6_addr32[3]); } static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]); } static inline u32 ipv6_portaddr_hash(const struct net *net, const struct in6_addr *addr6, unsigned int port) { unsigned int hash, mix = net_hash_mix(net); if (ipv6_addr_any(addr6)) hash = jhash_1word(0, mix); else if (ipv6_addr_v4mapped(addr6)) hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); else hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); return hash ^ port; } /* * Check for a RFC 4843 ORCHID address * (Overlay Routable Cryptographic Hash Identifiers) */ static inline bool ipv6_addr_orchid(const struct in6_addr *a) { return (a->s6_addr32[0] & htonl(0xfffffff0)) == htonl(0x20010010); } static inline bool ipv6_addr_is_multicast(const struct in6_addr *addr) { return (addr->s6_addr32[0] & htonl(0xFF000000)) == htonl(0xFF000000); } static inline void ipv6_addr_set_v4mapped(const __be32 addr, struct in6_addr *v4mapped) { ipv6_addr_set(v4mapped, 0, 0, htonl(0x0000FFFF), addr); } /* * find the first different bit between two addresses * length of address must be a multiple of 32bits */ static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int addrlen) { const __be32 *a1 = token1, *a2 = token2; int i; addrlen >>= 2; for (i = 0; i < addrlen; i++) { __be32 xb = a1[i] ^ a2[i]; if (xb) return i * 32 + 31 - __fls(ntohl(xb)); } /* * we should *never* get to this point since that * would mean the addrs are equal * * However, we do get to it 8) And exacly, when * addresses are equal 8) * * ip route add 1111::/128 via ... * ip route add 1111::/64 via ... * and we are here. * * Ideally, this function should stop comparison * at prefix length. It does not, but it is still OK, * if returned value is greater than prefix length. * --ANK (980803) */ return addrlen << 5; } #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 static inline int __ipv6_addr_diff64(const void *token1, const void *token2, int addrlen) { const __be64 *a1 = token1, *a2 = token2; int i; addrlen >>= 3; for (i = 0; i < addrlen; i++) { __be64 xb = a1[i] ^ a2[i]; if (xb) return i * 64 + 63 - __fls(be64_to_cpu(xb)); } return addrlen << 6; } #endif static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 if (__builtin_constant_p(addrlen) && !(addrlen & 7)) return __ipv6_addr_diff64(token1, token2, addrlen); #endif return __ipv6_addr_diff32(token1, token2, addrlen); } static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2) { return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); } __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, struct dst_entry *dst) { int hlimit; if (ipv6_addr_is_multicast(&fl6->daddr)) hlimit = np->mcast_hops; else hlimit = np->hop_limit; if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); return hlimit; } /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; */ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, const struct ipv6hdr *iph) { BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) != offsetof(typeof(flow->addrs), v6addrs.src) + sizeof(flow->addrs.v6addrs.src)); memcpy(&flow->addrs.v6addrs, &iph->addrs, sizeof(flow->addrs.v6addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } #if IS_ENABLED(CONFIG_IPV6) static inline bool ipv6_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return net->ipv6.sysctl.ip_nonlocal_bind || test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags); } /* Sysctl settings for net ipv6.auto_flowlabels */ #define IP6_AUTO_FLOW_LABEL_OFF 0 #define IP6_AUTO_FLOW_LABEL_OPTOUT 1 #define IP6_AUTO_FLOW_LABEL_OPTIN 2 #define IP6_AUTO_FLOW_LABEL_FORCED 3 #define IP6_AUTO_FLOW_LABEL_MAX IP6_AUTO_FLOW_LABEL_FORCED #define IP6_DEFAULT_AUTO_FLOW_LABELS IP6_AUTO_FLOW_LABEL_OPTOUT static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { u32 hash; /* @flowlabel may include more than a flow label, eg, the traffic class. * Here we want only the flow label value. */ flowlabel &= IPV6_FLOWLABEL_MASK; if (flowlabel || net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF || (!autolabel && net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)) return flowlabel; hash = skb_get_hash_flowi6(skb, fl6); /* Since this is being sent on the wire obfuscate hash a bit * to minimize possbility that any useful information to an * attacker is leaked. Only lower 20 bits are relevant. */ hash = rol32(hash, 16); flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; if (net->ipv6.sysctl.flowlabel_state_ranges) flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG; return flowlabel; } static inline int ip6_default_np_autolabel(struct net *net) { switch (net->ipv6.sysctl.auto_flowlabels) { case IP6_AUTO_FLOW_LABEL_OFF: case IP6_AUTO_FLOW_LABEL_OPTIN: default: return 0; case IP6_AUTO_FLOW_LABEL_OPTOUT: case IP6_AUTO_FLOW_LABEL_FORCED: return 1; } } #else static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { return flowlabel; } static inline int ip6_default_np_autolabel(struct net *net) { return 0; } #endif #if IS_ENABLED(CONFIG_IPV6) static inline int ip6_multipath_hash_policy(const struct net *net) { return net->ipv6.sysctl.multipath_hash_policy; } static inline u32 ip6_multipath_hash_fields(const struct net *net) { return net->ipv6.sysctl.multipath_hash_fields; } #else static inline int ip6_multipath_hash_policy(const struct net *net) { return 0; } static inline u32 ip6_multipath_hash_fields(const struct net *net) { return 0; } #endif /* * Header manipulation */ static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass, __be32 flowlabel) { *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | flowlabel; } static inline __be32 ip6_flowinfo(const struct ipv6hdr *hdr) { return *(__be32 *)hdr & IPV6_FLOWINFO_MASK; } static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr) { return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK; } static inline u8 ip6_tclass(__be32 flowinfo) { return ntohl(flowinfo & IPV6_TCLASS_MASK) >> IPV6_TCLASS_SHIFT; } static inline dscp_t ip6_dscp(__be32 flowinfo) { return inet_dsfield_to_dscp(ip6_tclass(flowinfo)); } static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) { return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; } static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6) { return fl6->flowlabel & IPV6_FLOWLABEL_MASK; } /* * Prototypes exported by ipv6 */ /* * rcv function (called from netdevice level) */ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev); int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb); /* * upper-layer output functions */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags); int ip6_push_pending_frames(struct sock *sk); void ip6_flush_pending_frames(struct sock *sk); int ip6_send_skb(struct sk_buff *skb); struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork, struct inet6_cork *v6_cork); struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork); static inline struct sk_buff *ip6_finish_skb(struct sock *sk) { return __ip6_make_skb(sk, &sk->sk_write_queue, &inet_sk(sk)->cork, &inet6_sk(sk)->cork); } int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst); struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst, bool connected); struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb, struct net_device *dev, struct net *net, struct socket *sock, struct in6_addr *saddr, const struct ip_tunnel_info *info, u8 protocol, bool use_cache); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *orig_dst); /* * skb processing functions */ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip6_forward(struct sk_buff *skb); int ip6_input(struct sk_buff *skb); int ip6_mc_input(struct sk_buff *skb); void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, bool have_final); int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); /* * Extension header (options) processing */ void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto, struct in6_addr **daddr_p, struct in6_addr *saddr); void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto); int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp, __be16 *frag_offp); bool ipv6_ext_hdr(u8 nexthdr); enum { IP6_FH_F_FRAG = (1 << 0), IP6_FH_F_AUTH = (1 << 1), IP6_FH_F_SKIP_RH = (1 << 2), }; /* find specified header and get offset to it */ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *fragflg); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type); struct in6_addr *fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, struct in6_addr *orig); /* * socket options (ipv6_sockglue.c) */ DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount); int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr); void ip6_datagram_release_cb(struct sock *sk); int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, int *addr_len); void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); void inet6_cleanup_sock(struct sock *sk); void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); /* * reassembly.c */ extern const struct proto_ops inet6_stream_ops; extern const struct proto_ops inet6_dgram_ops; extern const struct proto_ops inet6_sockraw_ops; struct group_source_req; struct group_filter; int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t ss_offset); #ifdef CONFIG_PROC_FS int ac6_proc_init(struct net *net); void ac6_proc_exit(struct net *net); int raw6_proc_init(void); void raw6_proc_exit(void); int tcp6_proc_init(struct net *net); void tcp6_proc_exit(struct net *net); int udp6_proc_init(struct net *net); void udp6_proc_exit(struct net *net); int udplite6_proc_init(void); void udplite6_proc_exit(void); int ipv6_misc_proc_init(void); void ipv6_misc_proc_exit(void); int snmp6_register_dev(struct inet6_dev *idev); int snmp6_unregister_dev(struct inet6_dev *idev); #else static inline int ac6_proc_init(struct net *net) { return 0; } static inline void ac6_proc_exit(struct net *net) { } static inline int snmp6_register_dev(struct inet6_dev *idev) { return 0; } static inline int snmp6_unregister_dev(struct inet6_dev *idev) { return 0; } #endif #ifdef CONFIG_SYSCTL struct ctl_table *ipv6_icmp_sysctl_init(struct net *net); size_t ipv6_icmp_sysctl_table_size(void); struct ctl_table *ipv6_route_sysctl_init(struct net *net); size_t ipv6_route_sysctl_table_size(struct net *net); int ipv6_sysctl_register(void); void ipv6_sysctl_unregister(void); #endif int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); static inline int ip6_sock_set_v6only(struct sock *sk) { if (inet_sk(sk)->inet_num) return -EINVAL; lock_sock(sk); sk->sk_ipv6only = true; release_sock(sk); return 0; } static inline void ip6_sock_set_recverr(struct sock *sk) { lock_sock(sk); inet6_sk(sk)->recverr = true; release_sock(sk); } static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val) { unsigned int pref = 0; unsigned int prefmask = ~0; /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ switch (val & (IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { case IPV6_PREFER_SRC_PUBLIC: pref |= IPV6_PREFER_SRC_PUBLIC; prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case IPV6_PREFER_SRC_TMP: pref |= IPV6_PREFER_SRC_TMP; prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case IPV6_PREFER_SRC_PUBTMP_DEFAULT: prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case 0: break; default: return -EINVAL; } /* check HOME/COA conflicts */ switch (val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA)) { case IPV6_PREFER_SRC_HOME: prefmask &= ~IPV6_PREFER_SRC_COA; break; case IPV6_PREFER_SRC_COA: pref |= IPV6_PREFER_SRC_COA; break; case 0: break; default: return -EINVAL; } /* check CGA/NONCGA conflicts */ switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { case IPV6_PREFER_SRC_CGA: case IPV6_PREFER_SRC_NONCGA: case 0: break; default: return -EINVAL; } inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref; return 0; } static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val) { int ret; lock_sock(sk); ret = __ip6_sock_set_addr_preferences(sk, val); release_sock(sk); return ret; } static inline void ip6_sock_set_recvpktinfo(struct sock *sk) { lock_sock(sk); inet6_sk(sk)->rxopt.bits.rxinfo = true; release_sock(sk); } #endif /* _NET_IPV6_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NET Generic infrastructure for Network protocols. * * Definitions for request_sock * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * From code originally in include/net/tcp.h */ #ifndef _REQUEST_SOCK_H #define _REQUEST_SOCK_H #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/refcount.h> #include <net/sock.h> struct request_sock; struct sk_buff; struct dst_entry; struct proto; struct request_sock_ops { int family; unsigned int obj_size; struct kmem_cache *slab; char *slab_name; int (*rtx_syn_ack)(const struct sock *sk, struct request_sock *req); void (*send_ack)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); void (*send_reset)(const struct sock *sk, struct sk_buff *skb); void (*destructor)(struct request_sock *req); void (*syn_ack_timeout)(const struct request_sock *req); }; int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req); struct saved_syn { u32 mac_hdrlen; u32 network_hdrlen; u32 tcp_hdrlen; u8 data[]; }; /* struct request_sock - mini sock to represent a connection request */ struct request_sock { struct sock_common __req_common; #define rsk_refcnt __req_common.skc_refcnt #define rsk_hash __req_common.skc_hash #define rsk_listener __req_common.skc_listener #define rsk_window_clamp __req_common.skc_window_clamp #define rsk_rcv_wnd __req_common.skc_rcv_wnd struct request_sock *dl_next; u16 mss; u8 num_retrans; /* number of retransmits */ u8 syncookie:1; /* syncookie: encode tcpopts in timestamp */ u8 num_timeout:7; /* number of timeouts */ u32 ts_recent; struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; struct sock *sk; struct saved_syn *saved_syn; u32 secid; u32 peer_secid; u32 timeout; }; static inline struct request_sock *inet_reqsk(const struct sock *sk) { return (struct request_sock *)sk; } static inline struct sock *req_to_sk(struct request_sock *req) { return (struct sock *)req; } static inline struct request_sock * reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { struct request_sock *req; req = kmem_cache_alloc(ops->slab, GFP_ATOMIC | __GFP_NOWARN); if (!req) return NULL; req->rsk_listener = NULL; if (attach_listener) { if (unlikely(!refcount_inc_not_zero(&sk_listener->sk_refcnt))) { kmem_cache_free(ops->slab, req); return NULL; } req->rsk_listener = sk_listener; } req->rsk_ops = ops; req_to_sk(req)->sk_prot = sk_listener->sk_prot; sk_node_init(&req_to_sk(req)->sk_node); sk_tx_queue_clear(req_to_sk(req)); req->saved_syn = NULL; req->timeout = 0; req->num_timeout = 0; req->num_retrans = 0; req->sk = NULL; refcount_set(&req->rsk_refcnt, 0); return req; } static inline void __reqsk_free(struct request_sock *req) { req->rsk_ops->destructor(req); if (req->rsk_listener) sock_put(req->rsk_listener); kfree(req->saved_syn); kmem_cache_free(req->rsk_ops->slab, req); } static inline void reqsk_free(struct request_sock *req) { WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0); __reqsk_free(req); } static inline void reqsk_put(struct request_sock *req) { if (refcount_dec_and_test(&req->rsk_refcnt)) reqsk_free(req); } /* * For a TCP Fast Open listener - * lock - protects the access to all the reqsk, which is co-owned by * the listener and the child socket. * qlen - pending TFO requests (still in TCP_SYN_RECV). * max_qlen - max TFO reqs allowed before TFO is disabled. * * XXX (TFO) - ideally these fields can be made as part of "listen_sock" * structure above. But there is some implementation difficulty due to * listen_sock being part of request_sock_queue hence will be freed when * a listener is stopped. But TFO related fields may continue to be * accessed even after a listener is closed, until its sk_refcnt drops * to 0 implying no more outstanding TFO reqs. One solution is to keep * listen_opt around until sk_refcnt drops to 0. But there is some other * complexity that needs to be resolved. E.g., a listener can be disabled * temporarily through shutdown()->tcp_disconnect(), and re-enabled later. */ struct fastopen_queue { struct request_sock *rskq_rst_head; /* Keep track of past TFO */ struct request_sock *rskq_rst_tail; /* requests that caused RST. * This is part of the defense * against spoofing attack. */ spinlock_t lock; int qlen; /* # of pending (TCP_SYN_RECV) reqs */ int max_qlen; /* != 0 iff TFO is currently enabled */ struct tcp_fastopen_context __rcu *ctx; /* cipher context for cookie */ }; /** struct request_sock_queue - queue of request_socks * * @rskq_accept_head - FIFO head of established children * @rskq_accept_tail - FIFO tail of established children * @rskq_defer_accept - User waits for some data after accept() * */ struct request_sock_queue { spinlock_t rskq_lock; u8 rskq_defer_accept; u32 synflood_warned; atomic_t qlen; atomic_t young; struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; struct fastopen_queue fastopenq; /* Check max_qlen != 0 to determine * if TFO is enabled. */ }; void reqsk_queue_alloc(struct request_sock_queue *queue); void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset); static inline bool reqsk_queue_empty(const struct request_sock_queue *queue) { return READ_ONCE(queue->rskq_accept_head) == NULL; } static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue, struct sock *parent) { struct request_sock *req; spin_lock_bh(&queue->rskq_lock); req = queue->rskq_accept_head; if (req) { sk_acceptq_removed(parent); WRITE_ONCE(queue->rskq_accept_head, req->dl_next); if (queue->rskq_accept_head == NULL) queue->rskq_accept_tail = NULL; } spin_unlock_bh(&queue->rskq_lock); return req; } static inline void reqsk_queue_removed(struct request_sock_queue *queue, const struct request_sock *req) { if (req->num_timeout == 0) atomic_dec(&queue->young); atomic_dec(&queue->qlen); } static inline void reqsk_queue_added(struct request_sock_queue *queue) { atomic_inc(&queue->young); atomic_inc(&queue->qlen); } static inline int reqsk_queue_len(const struct request_sock_queue *queue) { return atomic_read(&queue->qlen); } static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) { return atomic_read(&queue->young); } #endif /* _REQUEST_SOCK_H */ |
31 31 31 31 31 14 14 14 14 3 3 3 3 14 14 14 14 14 14 13 25 25 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Create default crypto algorithm instances. * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <linux/completion.h> #include <linux/ctype.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/string.h> #include "internal.h" struct cryptomgr_param { struct rtattr *tb[CRYPTO_MAX_ATTRS + 2]; struct { struct rtattr attr; struct crypto_attr_type data; } type; struct { struct rtattr attr; struct crypto_attr_alg data; } attrs[CRYPTO_MAX_ATTRS]; char template[CRYPTO_MAX_ALG_NAME]; struct crypto_larval *larval; u32 otype; u32 omask; }; struct crypto_test_param { char driver[CRYPTO_MAX_ALG_NAME]; char alg[CRYPTO_MAX_ALG_NAME]; u32 type; }; static int cryptomgr_probe(void *data) { struct cryptomgr_param *param = data; struct crypto_template *tmpl; int err; tmpl = crypto_lookup_template(param->template); if (!tmpl) goto out; do { err = tmpl->create(tmpl, param->tb); } while (err == -EAGAIN && !signal_pending(current)); crypto_tmpl_put(tmpl); out: complete_all(¶m->larval->completion); crypto_alg_put(¶m->larval->alg); kfree(param); module_put_and_kthread_exit(0); } static int cryptomgr_schedule_probe(struct crypto_larval *larval) { struct task_struct *thread; struct cryptomgr_param *param; const char *name = larval->alg.cra_name; const char *p; unsigned int len; int i; if (!try_module_get(THIS_MODULE)) goto err; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) goto err_put_module; for (p = name; isalnum(*p) || *p == '-' || *p == '_'; p++) ; len = p - name; if (!len || *p != '(') goto err_free_param; memcpy(param->template, name, len); i = 0; for (;;) { name = ++p; for (; isalnum(*p) || *p == '-' || *p == '_'; p++) ; if (*p == '(') { int recursion = 0; for (;;) { if (!*++p) goto err_free_param; if (*p == '(') recursion++; else if (*p == ')' && !recursion--) break; } p++; } len = p - name; if (!len) goto err_free_param; param->attrs[i].attr.rta_len = sizeof(param->attrs[i]); param->attrs[i].attr.rta_type = CRYPTOA_ALG; memcpy(param->attrs[i].data.name, name, len); param->tb[i + 1] = ¶m->attrs[i].attr; i++; if (i >= CRYPTO_MAX_ATTRS) goto err_free_param; if (*p == ')') break; if (*p != ',') goto err_free_param; } if (!i) goto err_free_param; param->tb[i + 1] = NULL; param->type.attr.rta_len = sizeof(param->type); param->type.attr.rta_type = CRYPTOA_TYPE; param->type.data.type = larval->alg.cra_flags & ~CRYPTO_ALG_TESTED; param->type.data.mask = larval->mask & ~CRYPTO_ALG_TESTED; param->tb[0] = ¶m->type.attr; param->otype = larval->alg.cra_flags; param->omask = larval->mask; crypto_alg_get(&larval->alg); param->larval = larval; thread = kthread_run(cryptomgr_probe, param, "cryptomgr_probe"); if (IS_ERR(thread)) goto err_put_larval; return NOTIFY_STOP; err_put_larval: crypto_alg_put(&larval->alg); err_free_param: kfree(param); err_put_module: module_put(THIS_MODULE); err: return NOTIFY_OK; } static int cryptomgr_test(void *data) { struct crypto_test_param *param = data; u32 type = param->type; int err; err = alg_test(param->driver, param->alg, type, CRYPTO_ALG_TESTED); crypto_alg_tested(param->driver, err); kfree(param); module_put_and_kthread_exit(0); } static int cryptomgr_schedule_test(struct crypto_alg *alg) { struct task_struct *thread; struct crypto_test_param *param; if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) return NOTIFY_DONE; if (!try_module_get(THIS_MODULE)) goto err; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) goto err_put_module; memcpy(param->driver, alg->cra_driver_name, sizeof(param->driver)); memcpy(param->alg, alg->cra_name, sizeof(param->alg)); param->type = alg->cra_flags; thread = kthread_run(cryptomgr_test, param, "cryptomgr_test"); if (IS_ERR(thread)) goto err_free_param; return NOTIFY_STOP; err_free_param: kfree(param); err_put_module: module_put(THIS_MODULE); err: return NOTIFY_OK; } static int cryptomgr_notify(struct notifier_block *this, unsigned long msg, void *data) { switch (msg) { case CRYPTO_MSG_ALG_REQUEST: return cryptomgr_schedule_probe(data); case CRYPTO_MSG_ALG_REGISTER: return cryptomgr_schedule_test(data); case CRYPTO_MSG_ALG_LOADED: break; } return NOTIFY_DONE; } static struct notifier_block cryptomgr_notifier = { .notifier_call = cryptomgr_notify, }; static int __init cryptomgr_init(void) { return crypto_register_notifier(&cryptomgr_notifier); } static void __exit cryptomgr_exit(void) { int err = crypto_unregister_notifier(&cryptomgr_notifier); BUG_ON(err); } /* * This is arch_initcall() so that the crypto self-tests are run on algorithms * registered early by subsys_initcall(). subsys_initcall() is needed for * generic implementations so that they're available for comparison tests when * other implementations are registered later by module_init(). */ arch_initcall(cryptomgr_init); module_exit(cryptomgr_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Crypto Algorithm Manager"); |
39 41 403 89 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NDISC_H #define _NDISC_H #include <net/ipv6_stubs.h> /* * ICMP codes for neighbour discovery messages */ #define NDISC_ROUTER_SOLICITATION 133 #define NDISC_ROUTER_ADVERTISEMENT 134 #define NDISC_NEIGHBOUR_SOLICITATION 135 #define NDISC_NEIGHBOUR_ADVERTISEMENT 136 #define NDISC_REDIRECT 137 /* * Router type: cross-layer information from link-layer to * IPv6 layer reported by certain link types (e.g., RFC4214). */ #define NDISC_NODETYPE_UNSPEC 0 /* unspecified (default) */ #define NDISC_NODETYPE_HOST 1 /* host or unauthorized router */ #define NDISC_NODETYPE_NODEFAULT 2 /* non-default router */ #define NDISC_NODETYPE_DEFAULT 3 /* default router */ /* * ndisc options */ enum { __ND_OPT_PREFIX_INFO_END = 0, ND_OPT_SOURCE_LL_ADDR = 1, /* RFC2461 */ ND_OPT_TARGET_LL_ADDR = 2, /* RFC2461 */ ND_OPT_PREFIX_INFO = 3, /* RFC2461 */ ND_OPT_REDIRECT_HDR = 4, /* RFC2461 */ ND_OPT_MTU = 5, /* RFC2461 */ ND_OPT_NONCE = 14, /* RFC7527 */ __ND_OPT_ARRAY_MAX, ND_OPT_ROUTE_INFO = 24, /* RFC4191 */ ND_OPT_RDNSS = 25, /* RFC5006 */ ND_OPT_DNSSL = 31, /* RFC6106 */ ND_OPT_6CO = 34, /* RFC6775 */ ND_OPT_CAPTIVE_PORTAL = 37, /* RFC7710 */ ND_OPT_PREF64 = 38, /* RFC8781 */ __ND_OPT_MAX }; #define MAX_RTR_SOLICITATION_DELAY HZ #define ND_REACHABLE_TIME (30*HZ) #define ND_RETRANS_TIMER HZ #include <linux/compiler.h> #include <linux/icmpv6.h> #include <linux/in6.h> #include <linux/types.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/hash.h> #include <net/neighbour.h> /* Set to 3 to get tracing... */ #define ND_DEBUG 1 #define ND_PRINTK(val, level, fmt, ...) \ do { \ if (val <= ND_DEBUG) \ net_##level##_ratelimited(fmt, ##__VA_ARGS__); \ } while (0) struct ctl_table; struct inet6_dev; struct net_device; struct net_proto_family; struct sk_buff; struct prefix_info; extern struct neigh_table nd_tbl; struct nd_msg { struct icmp6hdr icmph; struct in6_addr target; __u8 opt[]; }; struct rs_msg { struct icmp6hdr icmph; __u8 opt[]; }; struct ra_msg { struct icmp6hdr icmph; __be32 reachable_time; __be32 retrans_timer; }; struct rd_msg { struct icmp6hdr icmph; struct in6_addr target; struct in6_addr dest; __u8 opt[]; }; struct nd_opt_hdr { __u8 nd_opt_type; __u8 nd_opt_len; } __packed; /* ND options */ struct ndisc_options { struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX]; #ifdef CONFIG_IPV6_ROUTE_INFO struct nd_opt_hdr *nd_opts_ri; struct nd_opt_hdr *nd_opts_ri_end; #endif struct nd_opt_hdr *nd_useropts; struct nd_opt_hdr *nd_useropts_end; #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) struct nd_opt_hdr *nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR + 1]; #endif }; #define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] #define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] #define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] #define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] #define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] #define nd_opts_mtu nd_opt_array[ND_OPT_MTU] #define nd_opts_nonce nd_opt_array[ND_OPT_NONCE] #define nd_802154_opts_src_lladdr nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR] #define nd_802154_opts_tgt_lladdr nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR] #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) struct ndisc_options *ndisc_parse_options(const struct net_device *dev, u8 *opt, int opt_len, struct ndisc_options *ndopts); void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, int data_len, int pad); #define NDISC_OPS_REDIRECT_DATA_SPACE 2 /* * This structure defines the hooks for IPv6 neighbour discovery. * The following hooks can be defined; unless noted otherwise, they are * optional and can be filled with a null pointer. * * int (*is_useropt)(u8 nd_opt_type): * This function is called when IPv6 decide RA userspace options. if * this function returns 1 then the option given by nd_opt_type will * be handled as userspace option additional to the IPv6 options. * * int (*parse_options)(const struct net_device *dev, * struct nd_opt_hdr *nd_opt, * struct ndisc_options *ndopts): * This function is called while parsing ndisc ops and put each position * as pointer into ndopts. If this function return unequal 0, then this * function took care about the ndisc option, if 0 then the IPv6 ndisc * option parser will take care about that option. * * void (*update)(const struct net_device *dev, struct neighbour *n, * u32 flags, u8 icmp6_type, * const struct ndisc_options *ndopts): * This function is called when IPv6 ndisc updates the neighbour cache * entry. Additional options which can be updated may be previously * parsed by parse_opts callback and accessible over ndopts parameter. * * int (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type, * struct neighbour *neigh, u8 *ha_buf, * u8 **ha): * This function is called when the necessary option space will be * calculated before allocating a skb. The parameters neigh, ha_buf * abd ha are available on NDISC_REDIRECT messages only. * * void (*fill_addr_option)(const struct net_device *dev, * struct sk_buff *skb, u8 icmp6_type, * const u8 *ha): * This function is called when the skb will finally fill the option * fields inside skb. NOTE: this callback should fill the option * fields to the skb which are previously indicated by opt_space * parameter. That means the decision to add such option should * not lost between these two callbacks, e.g. protected by interface * up state. * * void (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev, * const struct prefix_info *pinfo, * struct inet6_dev *in6_dev, * struct in6_addr *addr, * int addr_type, u32 addr_flags, * bool sllao, bool tokenized, * __u32 valid_lft, u32 prefered_lft, * bool dev_addr_generated): * This function is called when a RA messages is received with valid * PIO option fields and an IPv6 address will be added to the interface * for autoconfiguration. The parameter dev_addr_generated reports about * if the address was based on dev->dev_addr or not. This can be used * to add a second address if link-layer operates with two link layer * addresses. E.g. 802.15.4 6LoWPAN. */ struct ndisc_ops { int (*is_useropt)(u8 nd_opt_type); int (*parse_options)(const struct net_device *dev, struct nd_opt_hdr *nd_opt, struct ndisc_options *ndopts); void (*update)(const struct net_device *dev, struct neighbour *n, u32 flags, u8 icmp6_type, const struct ndisc_options *ndopts); int (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type, struct neighbour *neigh, u8 *ha_buf, u8 **ha); void (*fill_addr_option)(const struct net_device *dev, struct sk_buff *skb, u8 icmp6_type, const u8 *ha); void (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev, const struct prefix_info *pinfo, struct inet6_dev *in6_dev, struct in6_addr *addr, int addr_type, u32 addr_flags, bool sllao, bool tokenized, __u32 valid_lft, u32 prefered_lft, bool dev_addr_generated); }; #if IS_ENABLED(CONFIG_IPV6) static inline int ndisc_ops_is_useropt(const struct net_device *dev, u8 nd_opt_type) { if (dev->ndisc_ops && dev->ndisc_ops->is_useropt) return dev->ndisc_ops->is_useropt(nd_opt_type); else return 0; } static inline int ndisc_ops_parse_options(const struct net_device *dev, struct nd_opt_hdr *nd_opt, struct ndisc_options *ndopts) { if (dev->ndisc_ops && dev->ndisc_ops->parse_options) return dev->ndisc_ops->parse_options(dev, nd_opt, ndopts); else return 0; } static inline void ndisc_ops_update(const struct net_device *dev, struct neighbour *n, u32 flags, u8 icmp6_type, const struct ndisc_options *ndopts) { if (dev->ndisc_ops && dev->ndisc_ops->update) dev->ndisc_ops->update(dev, n, flags, icmp6_type, ndopts); } static inline int ndisc_ops_opt_addr_space(const struct net_device *dev, u8 icmp6_type) { if (dev->ndisc_ops && dev->ndisc_ops->opt_addr_space && icmp6_type != NDISC_REDIRECT) return dev->ndisc_ops->opt_addr_space(dev, icmp6_type, NULL, NULL, NULL); else return 0; } static inline int ndisc_ops_redirect_opt_addr_space(const struct net_device *dev, struct neighbour *neigh, u8 *ha_buf, u8 **ha) { if (dev->ndisc_ops && dev->ndisc_ops->opt_addr_space) return dev->ndisc_ops->opt_addr_space(dev, NDISC_REDIRECT, neigh, ha_buf, ha); else return 0; } static inline void ndisc_ops_fill_addr_option(const struct net_device *dev, struct sk_buff *skb, u8 icmp6_type) { if (dev->ndisc_ops && dev->ndisc_ops->fill_addr_option && icmp6_type != NDISC_REDIRECT) dev->ndisc_ops->fill_addr_option(dev, skb, icmp6_type, NULL); } static inline void ndisc_ops_fill_redirect_addr_option(const struct net_device *dev, struct sk_buff *skb, const u8 *ha) { if (dev->ndisc_ops && dev->ndisc_ops->fill_addr_option) dev->ndisc_ops->fill_addr_option(dev, skb, NDISC_REDIRECT, ha); } static inline void ndisc_ops_prefix_rcv_add_addr(struct net *net, struct net_device *dev, const struct prefix_info *pinfo, struct inet6_dev *in6_dev, struct in6_addr *addr, int addr_type, u32 addr_flags, bool sllao, bool tokenized, __u32 valid_lft, u32 prefered_lft, bool dev_addr_generated) { if (dev->ndisc_ops && dev->ndisc_ops->prefix_rcv_add_addr) dev->ndisc_ops->prefix_rcv_add_addr(net, dev, pinfo, in6_dev, addr, addr_type, addr_flags, sllao, tokenized, valid_lft, prefered_lft, dev_addr_generated); } #endif /* * Return the padding between the option length and the start of the * link addr. Currently only IP-over-InfiniBand needs this, although * if RFC 3831 IPv6-over-Fibre Channel is ever implemented it may * also need a pad of 2. */ static inline int ndisc_addr_option_pad(unsigned short type) { switch (type) { case ARPHRD_INFINIBAND: return 2; default: return 0; } } static inline int __ndisc_opt_addr_space(unsigned char addr_len, int pad) { return NDISC_OPT_SPACE(addr_len + pad); } #if IS_ENABLED(CONFIG_IPV6) static inline int ndisc_opt_addr_space(struct net_device *dev, u8 icmp6_type) { return __ndisc_opt_addr_space(dev->addr_len, ndisc_addr_option_pad(dev->type)) + ndisc_ops_opt_addr_space(dev, icmp6_type); } static inline int ndisc_redirect_opt_addr_space(struct net_device *dev, struct neighbour *neigh, u8 *ops_data_buf, u8 **ops_data) { return __ndisc_opt_addr_space(dev->addr_len, ndisc_addr_option_pad(dev->type)) + ndisc_ops_redirect_opt_addr_space(dev, neigh, ops_data_buf, ops_data); } #endif static inline u8 *__ndisc_opt_addr_data(struct nd_opt_hdr *p, unsigned char addr_len, int prepad) { u8 *lladdr = (u8 *)(p + 1); int lladdrlen = p->nd_opt_len << 3; if (lladdrlen != __ndisc_opt_addr_space(addr_len, prepad)) return NULL; return lladdr + prepad; } static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, struct net_device *dev) { return __ndisc_opt_addr_data(p, dev->addr_len, ndisc_addr_option_pad(dev->type)); } static inline u32 ndisc_hashfn(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { const u32 *p32 = pkey; return (((p32[0] ^ hash32_ptr(dev)) * hash_rnd[0]) + (p32[1] * hash_rnd[1]) + (p32[2] * hash_rnd[2]) + (p32[3] * hash_rnd[3])); } static inline struct neighbour *__ipv6_neigh_lookup_noref(struct net_device *dev, const void *pkey) { return ___neigh_lookup_noref(&nd_tbl, neigh_key_eq128, ndisc_hashfn, pkey, dev); } static inline struct neighbour *__ipv6_neigh_lookup_noref_stub(struct net_device *dev, const void *pkey) { return ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, ndisc_hashfn, pkey, dev); } static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, const void *pkey) { struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref(dev, pkey); if (n && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static inline void __ipv6_confirm_neigh(struct net_device *dev, const void *pkey) { struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref(dev, pkey); neigh_confirm(n); rcu_read_unlock(); } static inline void __ipv6_confirm_neigh_stub(struct net_device *dev, const void *pkey) { struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref_stub(dev, pkey); neigh_confirm(n); rcu_read_unlock(); } /* uses ipv6_stub and is meant for use outside of IPv6 core */ static inline struct neighbour *ip_neigh_gw6(struct net_device *dev, const void *addr) { struct neighbour *neigh; neigh = __ipv6_neigh_lookup_noref_stub(dev, addr); if (unlikely(!neigh)) neigh = __neigh_create(ipv6_stub->nd_tbl, addr, dev, false); return neigh; } int ndisc_init(void); int ndisc_late_init(void); void ndisc_late_cleanup(void); void ndisc_cleanup(void); enum skb_drop_reason ndisc_rcv(struct sk_buff *skb); struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *saddr, u64 nonce); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr, u64 nonce); void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr); void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt); void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target); int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir); void ndisc_update(const struct net_device *dev, struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, struct ndisc_options *ndopts); /* * IGMP */ int igmp6_init(void); int igmp6_late_init(void); void igmp6_cleanup(void); void igmp6_late_cleanup(void); void igmp6_event_query(struct sk_buff *skb); void igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif void inet6_ifinfo_notify(int event, struct inet6_dev *idev); #endif |
1061 125 1061 1061 32 32 30 31 31 32 32 32 32 32 32 32 32 32 31 31 6 31 32 32 32 31 1110 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 | // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/runtime.c - Helper functions for device runtime PM * * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * Copyright (C) 2010 Alan Stern <stern@rowland.harvard.edu> */ #include <linux/sched/mm.h> #include <linux/ktime.h> #include <linux/hrtimer.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/pm_wakeirq.h> #include <trace/events/rpm.h> #include "../base.h" #include "power.h" typedef int (*pm_callback_t)(struct device *); static pm_callback_t __rpm_get_callback(struct device *dev, size_t cb_offset) { pm_callback_t cb; const struct dev_pm_ops *ops; if (dev->pm_domain) ops = &dev->pm_domain->ops; else if (dev->type && dev->type->pm) ops = dev->type->pm; else if (dev->class && dev->class->pm) ops = dev->class->pm; else if (dev->bus && dev->bus->pm) ops = dev->bus->pm; else ops = NULL; if (ops) cb = *(pm_callback_t *)((void *)ops + cb_offset); else cb = NULL; if (!cb && dev->driver && dev->driver->pm) cb = *(pm_callback_t *)((void *)dev->driver->pm + cb_offset); return cb; } #define RPM_GET_CALLBACK(dev, callback) \ __rpm_get_callback(dev, offsetof(struct dev_pm_ops, callback)) static int rpm_resume(struct device *dev, int rpmflags); static int rpm_suspend(struct device *dev, int rpmflags); /** * update_pm_runtime_accounting - Update the time accounting of power states * @dev: Device to update the accounting for * * In order to be able to have time accounting of the various power states * (as used by programs such as PowerTOP to show the effectiveness of runtime * PM), we need to track the time spent in each state. * update_pm_runtime_accounting must be called each time before the * runtime_status field is updated, to account the time in the old state * correctly. */ static void update_pm_runtime_accounting(struct device *dev) { u64 now, last, delta; if (dev->power.disable_depth > 0) return; last = dev->power.accounting_timestamp; now = ktime_get_mono_fast_ns(); dev->power.accounting_timestamp = now; /* * Because ktime_get_mono_fast_ns() is not monotonic during * timekeeping updates, ensure that 'now' is after the last saved * timesptamp. */ if (now < last) return; delta = now - last; if (dev->power.runtime_status == RPM_SUSPENDED) dev->power.suspended_time += delta; else dev->power.active_time += delta; } static void __update_runtime_status(struct device *dev, enum rpm_status status) { update_pm_runtime_accounting(dev); dev->power.runtime_status = status; } static u64 rpm_get_accounted_time(struct device *dev, bool suspended) { u64 time; unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); update_pm_runtime_accounting(dev); time = suspended ? dev->power.suspended_time : dev->power.active_time; spin_unlock_irqrestore(&dev->power.lock, flags); return time; } u64 pm_runtime_active_time(struct device *dev) { return rpm_get_accounted_time(dev, false); } u64 pm_runtime_suspended_time(struct device *dev) { return rpm_get_accounted_time(dev, true); } EXPORT_SYMBOL_GPL(pm_runtime_suspended_time); /** * pm_runtime_deactivate_timer - Deactivate given device's suspend timer. * @dev: Device to handle. */ static void pm_runtime_deactivate_timer(struct device *dev) { if (dev->power.timer_expires > 0) { hrtimer_try_to_cancel(&dev->power.suspend_timer); dev->power.timer_expires = 0; } } /** * pm_runtime_cancel_pending - Deactivate suspend timer and cancel requests. * @dev: Device to handle. */ static void pm_runtime_cancel_pending(struct device *dev) { pm_runtime_deactivate_timer(dev); /* * In case there's a request pending, make sure its work function will * return without doing anything. */ dev->power.request = RPM_REQ_NONE; } /* * pm_runtime_autosuspend_expiration - Get a device's autosuspend-delay expiration time. * @dev: Device to handle. * * Compute the autosuspend-delay expiration time based on the device's * power.last_busy time. If the delay has already expired or is disabled * (negative) or the power.use_autosuspend flag isn't set, return 0. * Otherwise return the expiration time in nanoseconds (adjusted to be nonzero). * * This function may be called either with or without dev->power.lock held. * Either way it can be racy, since power.last_busy may be updated at any time. */ u64 pm_runtime_autosuspend_expiration(struct device *dev) { int autosuspend_delay; u64 expires; if (!dev->power.use_autosuspend) return 0; autosuspend_delay = READ_ONCE(dev->power.autosuspend_delay); if (autosuspend_delay < 0) return 0; expires = READ_ONCE(dev->power.last_busy); expires += (u64)autosuspend_delay * NSEC_PER_MSEC; if (expires > ktime_get_mono_fast_ns()) return expires; /* Expires in the future */ return 0; } EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration); static int dev_memalloc_noio(struct device *dev, void *data) { return dev->power.memalloc_noio; } /* * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag. * @dev: Device to handle. * @enable: True for setting the flag and False for clearing the flag. * * Set the flag for all devices in the path from the device to the * root device in the device tree if @enable is true, otherwise clear * the flag for devices in the path whose siblings don't set the flag. * * The function should only be called by block device, or network * device driver for solving the deadlock problem during runtime * resume/suspend: * * If memory allocation with GFP_KERNEL is called inside runtime * resume/suspend callback of any one of its ancestors(or the * block device itself), the deadlock may be triggered inside the * memory allocation since it might not complete until the block * device becomes active and the involed page I/O finishes. The * situation is pointed out first by Alan Stern. Network device * are involved in iSCSI kind of situation. * * The lock of dev_hotplug_mutex is held in the function for handling * hotplug race because pm_runtime_set_memalloc_noio() may be called * in async probe(). * * The function should be called between device_add() and device_del() * on the affected device(block/network device). */ void pm_runtime_set_memalloc_noio(struct device *dev, bool enable) { static DEFINE_MUTEX(dev_hotplug_mutex); mutex_lock(&dev_hotplug_mutex); for (;;) { bool enabled; /* hold power lock since bitfield is not SMP-safe. */ spin_lock_irq(&dev->power.lock); enabled = dev->power.memalloc_noio; dev->power.memalloc_noio = enable; spin_unlock_irq(&dev->power.lock); /* * not need to enable ancestors any more if the device * has been enabled. */ if (enabled && enable) break; dev = dev->parent; /* * clear flag of the parent device only if all the * children don't set the flag because ancestor's * flag was set by any one of the descendants. */ if (!dev || (!enable && device_for_each_child(dev, NULL, dev_memalloc_noio))) break; } mutex_unlock(&dev_hotplug_mutex); } EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio); /** * rpm_check_suspend_allowed - Test whether a device may be suspended. * @dev: Device to test. */ static int rpm_check_suspend_allowed(struct device *dev) { int retval = 0; if (dev->power.runtime_error) retval = -EINVAL; else if (dev->power.disable_depth > 0) retval = -EACCES; else if (atomic_read(&dev->power.usage_count)) retval = -EAGAIN; else if (!dev->power.ignore_children && atomic_read(&dev->power.child_count)) retval = -EBUSY; /* Pending resume requests take precedence over suspends. */ else if ((dev->power.deferred_resume && dev->power.runtime_status == RPM_SUSPENDING) || (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME)) retval = -EAGAIN; else if (__dev_pm_qos_resume_latency(dev) == 0) retval = -EPERM; else if (dev->power.runtime_status == RPM_SUSPENDED) retval = 1; return retval; } static int rpm_get_suppliers(struct device *dev) { struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) { int retval; if (!(link->flags & DL_FLAG_PM_RUNTIME)) continue; retval = pm_runtime_get_sync(link->supplier); /* Ignore suppliers with disabled runtime PM. */ if (retval < 0 && retval != -EACCES) { pm_runtime_put_noidle(link->supplier); return retval; } refcount_inc(&link->rpm_active); } return 0; } /** * pm_runtime_release_supplier - Drop references to device link's supplier. * @link: Target device link. * * Drop all runtime PM references associated with @link to its supplier device. */ void pm_runtime_release_supplier(struct device_link *link) { struct device *supplier = link->supplier; /* * The additional power.usage_count check is a safety net in case * the rpm_active refcount becomes saturated, in which case * refcount_dec_not_one() would return true forever, but it is not * strictly necessary. */ while (refcount_dec_not_one(&link->rpm_active) && atomic_read(&supplier->power.usage_count) > 0) pm_runtime_put_noidle(supplier); } static void __rpm_put_suppliers(struct device *dev, bool try_to_suspend) { struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) { pm_runtime_release_supplier(link); if (try_to_suspend) pm_request_idle(link->supplier); } } static void rpm_put_suppliers(struct device *dev) { __rpm_put_suppliers(dev, true); } static void rpm_suspend_suppliers(struct device *dev) { struct device_link *link; int idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) pm_request_idle(link->supplier); device_links_read_unlock(idx); } /** * __rpm_callback - Run a given runtime PM callback for a given device. * @cb: Runtime PM callback to run. * @dev: Device to run the callback for. */ static int __rpm_callback(int (*cb)(struct device *), struct device *dev) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int retval = 0, idx; bool use_links = dev->power.links_count > 0; if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); } else { spin_unlock_irq(&dev->power.lock); /* * Resume suppliers if necessary. * * The device's runtime PM status cannot change until this * routine returns, so it is safe to read the status outside of * the lock. */ if (use_links && dev->power.runtime_status == RPM_RESUMING) { idx = device_links_read_lock(); retval = rpm_get_suppliers(dev); if (retval) { rpm_put_suppliers(dev); goto fail; } device_links_read_unlock(idx); } } if (cb) retval = cb(dev); if (dev->power.irq_safe) { spin_lock(&dev->power.lock); } else { /* * If the device is suspending and the callback has returned * success, drop the usage counters of the suppliers that have * been reference counted on its resume. * * Do that if resume fails too. */ if (use_links && ((dev->power.runtime_status == RPM_SUSPENDING && !retval) || (dev->power.runtime_status == RPM_RESUMING && retval))) { idx = device_links_read_lock(); __rpm_put_suppliers(dev, false); fail: device_links_read_unlock(idx); } spin_lock_irq(&dev->power.lock); } return retval; } /** * rpm_callback - Run a given runtime PM callback for a given device. * @cb: Runtime PM callback to run. * @dev: Device to run the callback for. */ static int rpm_callback(int (*cb)(struct device *), struct device *dev) { int retval; if (dev->power.memalloc_noio) { unsigned int noio_flag; /* * Deadlock might be caused if memory allocation with * GFP_KERNEL happens inside runtime_suspend and * runtime_resume callbacks of one block device's * ancestor or the block device itself. Network * device might be thought as part of iSCSI block * device, so network device and its ancestor should * be marked as memalloc_noio too. */ noio_flag = memalloc_noio_save(); retval = __rpm_callback(cb, dev); memalloc_noio_restore(noio_flag); } else { retval = __rpm_callback(cb, dev); } dev->power.runtime_error = retval; return retval != -EACCES ? retval : -EIO; } /** * rpm_idle - Notify device bus type if the device can be suspended. * @dev: Device to notify the bus type about. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be suspended. If * another idle notification has been started earlier, return immediately. If * the RPM_ASYNC flag is set then queue an idle-notification request; otherwise * run the ->runtime_idle() callback directly. If the ->runtime_idle callback * doesn't exist or if it returns 0, call rpm_suspend with the RPM_AUTO flag. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_idle(struct device *dev, int rpmflags) { int (*callback)(struct device *); int retval; trace_rpm_idle(dev, rpmflags); retval = rpm_check_suspend_allowed(dev); if (retval < 0) ; /* Conditions are wrong. */ /* Idle notifications are allowed only in the RPM_ACTIVE state. */ else if (dev->power.runtime_status != RPM_ACTIVE) retval = -EAGAIN; /* * Any pending request other than an idle notification takes * precedence over us, except that the timer may be running. */ else if (dev->power.request_pending && dev->power.request > RPM_REQ_IDLE) retval = -EAGAIN; /* Act as though RPM_NOWAIT is always set. */ else if (dev->power.idle_notification) retval = -EINPROGRESS; if (retval) goto out; /* Pending requests need to be canceled. */ dev->power.request = RPM_REQ_NONE; callback = RPM_GET_CALLBACK(dev, runtime_idle); /* If no callback assume success. */ if (!callback || dev->power.no_callbacks) goto out; /* Carry out an asynchronous or a synchronous idle notification. */ if (rpmflags & RPM_ASYNC) { dev->power.request = RPM_REQ_IDLE; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } trace_rpm_return_int(dev, _THIS_IP_, 0); return 0; } dev->power.idle_notification = true; if (dev->power.irq_safe) spin_unlock(&dev->power.lock); else spin_unlock_irq(&dev->power.lock); retval = callback(dev); if (dev->power.irq_safe) spin_lock(&dev->power.lock); else spin_lock_irq(&dev->power.lock); dev->power.idle_notification = false; wake_up_all(&dev->power.wait_queue); out: trace_rpm_return_int(dev, _THIS_IP_, retval); return retval ? retval : rpm_suspend(dev, rpmflags | RPM_AUTO); } /** * rpm_suspend - Carry out runtime suspend of given device. * @dev: Device to suspend. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be suspended. * Cancel a pending idle notification, autosuspend or suspend. If * another suspend has been started earlier, either return immediately * or wait for it to finish, depending on the RPM_NOWAIT and RPM_ASYNC * flags. If the RPM_ASYNC flag is set then queue a suspend request; * otherwise run the ->runtime_suspend() callback directly. When * ->runtime_suspend succeeded, if a deferred resume was requested while * the callback was running then carry it out, otherwise send an idle * notification for its parent (if the suspend succeeded and both * ignore_children of parent->power and irq_safe of dev->power are not set). * If ->runtime_suspend failed with -EAGAIN or -EBUSY, and if the RPM_AUTO * flag is set and the next autosuspend-delay expiration time is in the * future, schedule another autosuspend attempt. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_suspend(struct device *dev, int rpmflags) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int (*callback)(struct device *); struct device *parent = NULL; int retval; trace_rpm_suspend(dev, rpmflags); repeat: retval = rpm_check_suspend_allowed(dev); if (retval < 0) goto out; /* Conditions are wrong. */ /* Synchronous suspends are not allowed in the RPM_RESUMING state. */ if (dev->power.runtime_status == RPM_RESUMING && !(rpmflags & RPM_ASYNC)) retval = -EAGAIN; if (retval) goto out; /* If the autosuspend_delay time hasn't expired yet, reschedule. */ if ((rpmflags & RPM_AUTO) && dev->power.runtime_status != RPM_SUSPENDING) { u64 expires = pm_runtime_autosuspend_expiration(dev); if (expires != 0) { /* Pending requests need to be canceled. */ dev->power.request = RPM_REQ_NONE; /* * Optimization: If the timer is already running and is * set to expire at or before the autosuspend delay, * avoid the overhead of resetting it. Just let it * expire; pm_suspend_timer_fn() will take care of the * rest. */ if (!(dev->power.timer_expires && dev->power.timer_expires <= expires)) { /* * We add a slack of 25% to gather wakeups * without sacrificing the granularity. */ u64 slack = (u64)READ_ONCE(dev->power.autosuspend_delay) * (NSEC_PER_MSEC >> 2); dev->power.timer_expires = expires; hrtimer_start_range_ns(&dev->power.suspend_timer, ns_to_ktime(expires), slack, HRTIMER_MODE_ABS); } dev->power.timer_autosuspends = 1; goto out; } } /* Other scheduled or pending requests need to be canceled. */ pm_runtime_cancel_pending(dev); if (dev->power.runtime_status == RPM_SUSPENDING) { DEFINE_WAIT(wait); if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) { retval = -EINPROGRESS; goto out; } if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); cpu_relax(); spin_lock(&dev->power.lock); goto repeat; } /* Wait for the other suspend running in parallel with us. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_SUSPENDING) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); goto repeat; } if (dev->power.no_callbacks) goto no_callback; /* Assume success. */ /* Carry out an asynchronous or a synchronous suspend. */ if (rpmflags & RPM_ASYNC) { dev->power.request = (rpmflags & RPM_AUTO) ? RPM_REQ_AUTOSUSPEND : RPM_REQ_SUSPEND; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } goto out; } __update_runtime_status(dev, RPM_SUSPENDING); callback = RPM_GET_CALLBACK(dev, runtime_suspend); dev_pm_enable_wake_irq_check(dev, true); retval = rpm_callback(callback, dev); if (retval) goto fail; dev_pm_enable_wake_irq_complete(dev); no_callback: __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_deactivate_timer(dev); if (dev->parent) { parent = dev->parent; atomic_add_unless(&parent->power.child_count, -1, 0); } wake_up_all(&dev->power.wait_queue); if (dev->power.deferred_resume) { dev->power.deferred_resume = false; rpm_resume(dev, 0); retval = -EAGAIN; goto out; } if (dev->power.irq_safe) goto out; /* Maybe the parent is now able to suspend. */ if (parent && !parent->power.ignore_children) { spin_unlock(&dev->power.lock); spin_lock(&parent->power.lock); rpm_idle(parent, RPM_ASYNC); spin_unlock(&parent->power.lock); spin_lock(&dev->power.lock); } /* Maybe the suppliers are now able to suspend. */ if (dev->power.links_count > 0) { spin_unlock_irq(&dev->power.lock); rpm_suspend_suppliers(dev); spin_lock_irq(&dev->power.lock); } out: trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; fail: dev_pm_disable_wake_irq_check(dev, true); __update_runtime_status(dev, RPM_ACTIVE); dev->power.deferred_resume = false; wake_up_all(&dev->power.wait_queue); if (retval == -EAGAIN || retval == -EBUSY) { dev->power.runtime_error = 0; /* * If the callback routine failed an autosuspend, and * if the last_busy time has been updated so that there * is a new autosuspend expiration time, automatically * reschedule another autosuspend. */ if ((rpmflags & RPM_AUTO) && pm_runtime_autosuspend_expiration(dev) != 0) goto repeat; } else { pm_runtime_cancel_pending(dev); } goto out; } /** * rpm_resume - Carry out runtime resume of given device. * @dev: Device to resume. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be resumed. Cancel * any scheduled or pending requests. If another resume has been started * earlier, either return immediately or wait for it to finish, depending on the * RPM_NOWAIT and RPM_ASYNC flags. Similarly, if there's a suspend running in * parallel with this function, either tell the other process to resume after * suspending (deferred_resume) or wait for it to finish. If the RPM_ASYNC * flag is set then queue a resume request; otherwise run the * ->runtime_resume() callback directly. Queue an idle notification for the * device if the resume succeeded. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_resume(struct device *dev, int rpmflags) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int (*callback)(struct device *); struct device *parent = NULL; int retval = 0; trace_rpm_resume(dev, rpmflags); repeat: if (dev->power.runtime_error) { retval = -EINVAL; } else if (dev->power.disable_depth > 0) { if (dev->power.runtime_status == RPM_ACTIVE && dev->power.last_status == RPM_ACTIVE) retval = 1; else retval = -EACCES; } if (retval) goto out; /* * Other scheduled or pending requests need to be canceled. Small * optimization: If an autosuspend timer is running, leave it running * rather than cancelling it now only to restart it again in the near * future. */ dev->power.request = RPM_REQ_NONE; if (!dev->power.timer_autosuspends) pm_runtime_deactivate_timer(dev); if (dev->power.runtime_status == RPM_ACTIVE) { retval = 1; goto out; } if (dev->power.runtime_status == RPM_RESUMING || dev->power.runtime_status == RPM_SUSPENDING) { DEFINE_WAIT(wait); if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) { if (dev->power.runtime_status == RPM_SUSPENDING) { dev->power.deferred_resume = true; if (rpmflags & RPM_NOWAIT) retval = -EINPROGRESS; } else { retval = -EINPROGRESS; } goto out; } if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); cpu_relax(); spin_lock(&dev->power.lock); goto repeat; } /* Wait for the operation carried out in parallel with us. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_RESUMING && dev->power.runtime_status != RPM_SUSPENDING) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); goto repeat; } /* * See if we can skip waking up the parent. This is safe only if * power.no_callbacks is set, because otherwise we don't know whether * the resume will actually succeed. */ if (dev->power.no_callbacks && !parent && dev->parent) { spin_lock_nested(&dev->parent->power.lock, SINGLE_DEPTH_NESTING); if (dev->parent->power.disable_depth > 0 || dev->parent->power.ignore_children || dev->parent->power.runtime_status == RPM_ACTIVE) { atomic_inc(&dev->parent->power.child_count); spin_unlock(&dev->parent->power.lock); retval = 1; goto no_callback; /* Assume success. */ } spin_unlock(&dev->parent->power.lock); } /* Carry out an asynchronous or a synchronous resume. */ if (rpmflags & RPM_ASYNC) { dev->power.request = RPM_REQ_RESUME; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } retval = 0; goto out; } if (!parent && dev->parent) { /* * Increment the parent's usage counter and resume it if * necessary. Not needed if dev is irq-safe; then the * parent is permanently resumed. */ parent = dev->parent; if (dev->power.irq_safe) goto skip_parent; spin_unlock(&dev->power.lock); pm_runtime_get_noresume(parent); spin_lock(&parent->power.lock); /* * Resume the parent if it has runtime PM enabled and not been * set to ignore its children. */ if (!parent->power.disable_depth && !parent->power.ignore_children) { rpm_resume(parent, 0); if (parent->power.runtime_status != RPM_ACTIVE) retval = -EBUSY; } spin_unlock(&parent->power.lock); spin_lock(&dev->power.lock); if (retval) goto out; goto repeat; } skip_parent: if (dev->power.no_callbacks) goto no_callback; /* Assume success. */ __update_runtime_status(dev, RPM_RESUMING); callback = RPM_GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); retval = rpm_callback(callback, dev); if (retval) { __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_cancel_pending(dev); dev_pm_enable_wake_irq_check(dev, false); } else { no_callback: __update_runtime_status(dev, RPM_ACTIVE); pm_runtime_mark_last_busy(dev); if (parent) atomic_inc(&parent->power.child_count); } wake_up_all(&dev->power.wait_queue); if (retval >= 0) rpm_idle(dev, RPM_ASYNC); out: if (parent && !dev->power.irq_safe) { spin_unlock_irq(&dev->power.lock); pm_runtime_put(parent); spin_lock_irq(&dev->power.lock); } trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; } /** * pm_runtime_work - Universal runtime PM work function. * @work: Work structure used for scheduling the execution of this function. * * Use @work to get the device object the work is to be done for, determine what * is to be done and execute the appropriate runtime PM function. */ static void pm_runtime_work(struct work_struct *work) { struct device *dev = container_of(work, struct device, power.work); enum rpm_request req; spin_lock_irq(&dev->power.lock); if (!dev->power.request_pending) goto out; req = dev->power.request; dev->power.request = RPM_REQ_NONE; dev->power.request_pending = false; switch (req) { case RPM_REQ_NONE: break; case RPM_REQ_IDLE: rpm_idle(dev, RPM_NOWAIT); break; case RPM_REQ_SUSPEND: rpm_suspend(dev, RPM_NOWAIT); break; case RPM_REQ_AUTOSUSPEND: rpm_suspend(dev, RPM_NOWAIT | RPM_AUTO); break; case RPM_REQ_RESUME: rpm_resume(dev, RPM_NOWAIT); break; } out: spin_unlock_irq(&dev->power.lock); } /** * pm_suspend_timer_fn - Timer function for pm_schedule_suspend(). * @timer: hrtimer used by pm_schedule_suspend(). * * Check if the time is right and queue a suspend request. */ static enum hrtimer_restart pm_suspend_timer_fn(struct hrtimer *timer) { struct device *dev = container_of(timer, struct device, power.suspend_timer); unsigned long flags; u64 expires; spin_lock_irqsave(&dev->power.lock, flags); expires = dev->power.timer_expires; /* * If 'expires' is after the current time, we've been called * too early. */ if (expires > 0 && expires < ktime_get_mono_fast_ns()) { dev->power.timer_expires = 0; rpm_suspend(dev, dev->power.timer_autosuspends ? (RPM_ASYNC | RPM_AUTO) : RPM_ASYNC); } spin_unlock_irqrestore(&dev->power.lock, flags); return HRTIMER_NORESTART; } /** * pm_schedule_suspend - Set up a timer to submit a suspend request in future. * @dev: Device to suspend. * @delay: Time to wait before submitting a suspend request, in milliseconds. */ int pm_schedule_suspend(struct device *dev, unsigned int delay) { unsigned long flags; u64 expires; int retval; spin_lock_irqsave(&dev->power.lock, flags); if (!delay) { retval = rpm_suspend(dev, RPM_ASYNC); goto out; } retval = rpm_check_suspend_allowed(dev); if (retval) goto out; /* Other scheduled or pending requests need to be canceled. */ pm_runtime_cancel_pending(dev); expires = ktime_get_mono_fast_ns() + (u64)delay * NSEC_PER_MSEC; dev->power.timer_expires = expires; dev->power.timer_autosuspends = 0; hrtimer_start(&dev->power.suspend_timer, expires, HRTIMER_MODE_ABS); out: spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(pm_schedule_suspend); static int rpm_drop_usage_count(struct device *dev) { int ret; ret = atomic_sub_return(1, &dev->power.usage_count); if (ret >= 0) return ret; /* * Because rpm_resume() does not check the usage counter, it will resume * the device even if the usage counter is 0 or negative, so it is * sufficient to increment the usage counter here to reverse the change * made above. */ atomic_inc(&dev->power.usage_count); dev_warn(dev, "Runtime PM usage count underflow!\n"); return -EINVAL; } /** * __pm_runtime_idle - Entry point for runtime idle operations. * @dev: Device to send idle notification for. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and * return immediately if it is larger than zero (if it becomes negative, log a * warning, increment it, and return an error). Then carry out an idle * notification, either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_idle(struct device *dev, int rpmflags) { unsigned long flags; int retval; if (rpmflags & RPM_GET_PUT) { retval = rpm_drop_usage_count(dev); if (retval < 0) { return retval; } else if (retval > 0) { trace_rpm_usage(dev, rpmflags); return 0; } } might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_idle(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_idle); /** * __pm_runtime_suspend - Entry point for runtime put/suspend operations. * @dev: Device to suspend. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and * return immediately if it is larger than zero (if it becomes negative, log a * warning, increment it, and return an error). Then carry out a suspend, * either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_suspend(struct device *dev, int rpmflags) { unsigned long flags; int retval; if (rpmflags & RPM_GET_PUT) { retval = rpm_drop_usage_count(dev); if (retval < 0) { return retval; } else if (retval > 0) { trace_rpm_usage(dev, rpmflags); return 0; } } might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_suspend(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_suspend); /** * __pm_runtime_resume - Entry point for runtime resume operations. * @dev: Device to resume. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, increment the device's usage count. Then * carry out a resume, either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_resume(struct device *dev, int rpmflags) { unsigned long flags; int retval; might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe && dev->power.runtime_status != RPM_ACTIVE); if (rpmflags & RPM_GET_PUT) atomic_inc(&dev->power.usage_count); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_resume(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_resume); /** * pm_runtime_get_if_active - Conditionally bump up device usage counter. * @dev: Device to handle. * @ign_usage_count: Whether or not to look at the current usage counter value. * * Return -EINVAL if runtime PM is disabled for @dev. * * Otherwise, if the runtime PM status of @dev is %RPM_ACTIVE and either * @ign_usage_count is %true or the runtime PM usage counter of @dev is not * zero, increment the usage counter of @dev and return 1. Otherwise, return 0 * without changing the usage counter. * * If @ign_usage_count is %true, this function can be used to prevent suspending * the device when its runtime PM status is %RPM_ACTIVE. * * If @ign_usage_count is %false, this function can be used to prevent * suspending the device when both its runtime PM status is %RPM_ACTIVE and its * runtime PM usage counter is not zero. * * The caller is responsible for decrementing the runtime PM usage counter of * @dev after this function has returned a positive value for it. */ int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count) { unsigned long flags; int retval; spin_lock_irqsave(&dev->power.lock, flags); if (dev->power.disable_depth > 0) { retval = -EINVAL; } else if (dev->power.runtime_status != RPM_ACTIVE) { retval = 0; } else if (ign_usage_count) { retval = 1; atomic_inc(&dev->power.usage_count); } else { retval = atomic_inc_not_zero(&dev->power.usage_count); } trace_rpm_usage(dev, 0); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(pm_runtime_get_if_active); /** * __pm_runtime_set_status - Set runtime PM status of a device. * @dev: Device to handle. * @status: New runtime PM status of the device. * * If runtime PM of the device is disabled or its power.runtime_error field is * different from zero, the status may be changed either to RPM_ACTIVE, or to * RPM_SUSPENDED, as long as that reflects the actual state of the device. * However, if the device has a parent and the parent is not active, and the * parent's power.ignore_children flag is unset, the device's status cannot be * set to RPM_ACTIVE, so -EBUSY is returned in that case. * * If successful, __pm_runtime_set_status() clears the power.runtime_error field * and the device parent's counter of unsuspended children is modified to * reflect the new status. If the new status is RPM_SUSPENDED, an idle * notification request for the parent is submitted. * * If @dev has any suppliers (as reflected by device links to them), and @status * is RPM_ACTIVE, they will be activated upfront and if the activation of one * of them fails, the status of @dev will be changed to RPM_SUSPENDED (instead * of the @status value) and the suppliers will be deacticated on exit. The * error returned by the failing supplier activation will be returned in that * case. */ int __pm_runtime_set_status(struct device *dev, unsigned int status) { struct device *parent = dev->parent; bool notify_parent = false; unsigned long flags; int error = 0; if (status != RPM_ACTIVE && status != RPM_SUSPENDED) return -EINVAL; spin_lock_irqsave(&dev->power.lock, flags); /* * Prevent PM-runtime from being enabled for the device or return an * error if it is enabled already and working. */ if (dev->power.runtime_error || dev->power.disable_depth) dev->power.disable_depth++; else error = -EAGAIN; spin_unlock_irqrestore(&dev->power.lock, flags); if (error) return error; /* * If the new status is RPM_ACTIVE, the suppliers can be activated * upfront regardless of the current status, because next time * rpm_put_suppliers() runs, the rpm_active refcounts of the links * involved will be dropped down to one anyway. */ if (status == RPM_ACTIVE) { int idx = device_links_read_lock(); error = rpm_get_suppliers(dev); if (error) status = RPM_SUSPENDED; device_links_read_unlock(idx); } spin_lock_irqsave(&dev->power.lock, flags); if (dev->power.runtime_status == status || !parent) goto out_set; if (status == RPM_SUSPENDED) { atomic_add_unless(&parent->power.child_count, -1, 0); notify_parent = !parent->power.ignore_children; } else { spin_lock_nested(&parent->power.lock, SINGLE_DEPTH_NESTING); /* * It is invalid to put an active child under a parent that is * not active, has runtime PM enabled and the * 'power.ignore_children' flag unset. */ if (!parent->power.disable_depth && !parent->power.ignore_children && parent->power.runtime_status != RPM_ACTIVE) { dev_err(dev, "runtime PM trying to activate child device %s but parent (%s) is not active\n", dev_name(dev), dev_name(parent)); error = -EBUSY; } else if (dev->power.runtime_status == RPM_SUSPENDED) { atomic_inc(&parent->power.child_count); } spin_unlock(&parent->power.lock); if (error) { status = RPM_SUSPENDED; goto out; } } out_set: __update_runtime_status(dev, status); if (!error) dev->power.runtime_error = 0; out: spin_unlock_irqrestore(&dev->power.lock, flags); if (notify_parent) pm_request_idle(parent); if (status == RPM_SUSPENDED) { int idx = device_links_read_lock(); rpm_put_suppliers(dev); device_links_read_unlock(idx); } pm_runtime_enable(dev); return error; } EXPORT_SYMBOL_GPL(__pm_runtime_set_status); /** * __pm_runtime_barrier - Cancel pending requests and wait for completions. * @dev: Device to handle. * * Flush all pending requests for the device from pm_wq and wait for all * runtime PM operations involving the device in progress to complete. * * Should be called under dev->power.lock with interrupts disabled. */ static void __pm_runtime_barrier(struct device *dev) { pm_runtime_deactivate_timer(dev); if (dev->power.request_pending) { dev->power.request = RPM_REQ_NONE; spin_unlock_irq(&dev->power.lock); cancel_work_sync(&dev->power.work); spin_lock_irq(&dev->power.lock); dev->power.request_pending = false; } if (dev->power.runtime_status == RPM_SUSPENDING || dev->power.runtime_status == RPM_RESUMING || dev->power.idle_notification) { DEFINE_WAIT(wait); /* Suspend, wake-up or idle notification in progress. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_SUSPENDING && dev->power.runtime_status != RPM_RESUMING && !dev->power.idle_notification) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); } } /** * pm_runtime_barrier - Flush pending requests and wait for completions. * @dev: Device to handle. * * Prevent the device from being suspended by incrementing its usage counter and * if there's a pending resume request for the device, wake the device up. * Next, make sure that all pending requests for the device have been flushed * from pm_wq and wait for all runtime PM operations involving the device in * progress to complete. * * Return value: * 1, if there was a resume request pending and the device had to be woken up, * 0, otherwise */ int pm_runtime_barrier(struct device *dev) { int retval = 0; pm_runtime_get_noresume(dev); spin_lock_irq(&dev->power.lock); if (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME) { rpm_resume(dev, 0); retval = 1; } __pm_runtime_barrier(dev); spin_unlock_irq(&dev->power.lock); pm_runtime_put_noidle(dev); return retval; } EXPORT_SYMBOL_GPL(pm_runtime_barrier); /** * __pm_runtime_disable - Disable runtime PM of a device. * @dev: Device to handle. * @check_resume: If set, check if there's a resume request for the device. * * Increment power.disable_depth for the device and if it was zero previously, * cancel all pending runtime PM requests for the device and wait for all * operations in progress to complete. The device can be either active or * suspended after its runtime PM has been disabled. * * If @check_resume is set and there's a resume request pending when * __pm_runtime_disable() is called and power.disable_depth is zero, the * function will wake up the device before disabling its runtime PM. */ void __pm_runtime_disable(struct device *dev, bool check_resume) { spin_lock_irq(&dev->power.lock); if (dev->power.disable_depth > 0) { dev->power.disable_depth++; goto out; } /* * Wake up the device if there's a resume request pending, because that * means there probably is some I/O to process and disabling runtime PM * shouldn't prevent the device from processing the I/O. */ if (check_resume && dev->power.request_pending && dev->power.request == RPM_REQ_RESUME) { /* * Prevent suspends and idle notifications from being carried * out after we have woken up the device. */ pm_runtime_get_noresume(dev); rpm_resume(dev, 0); pm_runtime_put_noidle(dev); } /* Update time accounting before disabling PM-runtime. */ update_pm_runtime_accounting(dev); if (!dev->power.disable_depth++) { __pm_runtime_barrier(dev); dev->power.last_status = dev->power.runtime_status; } out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(__pm_runtime_disable); /** * pm_runtime_enable - Enable runtime PM of a device. * @dev: Device to handle. */ void pm_runtime_enable(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); if (!dev->power.disable_depth) { dev_warn(dev, "Unbalanced %s!\n", __func__); goto out; } if (--dev->power.disable_depth > 0) goto out; dev->power.last_status = RPM_INVALID; dev->power.accounting_timestamp = ktime_get_mono_fast_ns(); if (dev->power.runtime_status == RPM_SUSPENDED && !dev->power.ignore_children && atomic_read(&dev->power.child_count) > 0) dev_warn(dev, "Enabling runtime PM for inactive device with active children\n"); out: spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_runtime_enable); static void pm_runtime_disable_action(void *data) { pm_runtime_dont_use_autosuspend(data); pm_runtime_disable(data); } /** * devm_pm_runtime_enable - devres-enabled version of pm_runtime_enable. * * NOTE: this will also handle calling pm_runtime_dont_use_autosuspend() for * you at driver exit time if needed. * * @dev: Device to handle. */ int devm_pm_runtime_enable(struct device *dev) { pm_runtime_enable(dev); return devm_add_action_or_reset(dev, pm_runtime_disable_action, dev); } EXPORT_SYMBOL_GPL(devm_pm_runtime_enable); /** * pm_runtime_forbid - Block runtime PM of a device. * @dev: Device to handle. * * Increase the device's usage count and clear its power.runtime_auto flag, * so that it cannot be suspended at run time until pm_runtime_allow() is called * for it. */ void pm_runtime_forbid(struct device *dev) { spin_lock_irq(&dev->power.lock); if (!dev->power.runtime_auto) goto out; dev->power.runtime_auto = false; atomic_inc(&dev->power.usage_count); rpm_resume(dev, 0); out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_forbid); /** * pm_runtime_allow - Unblock runtime PM of a device. * @dev: Device to handle. * * Decrease the device's usage count and set its power.runtime_auto flag. */ void pm_runtime_allow(struct device *dev) { int ret; spin_lock_irq(&dev->power.lock); if (dev->power.runtime_auto) goto out; dev->power.runtime_auto = true; ret = rpm_drop_usage_count(dev); if (ret == 0) rpm_idle(dev, RPM_AUTO | RPM_ASYNC); else if (ret > 0) trace_rpm_usage(dev, RPM_AUTO | RPM_ASYNC); out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_allow); /** * pm_runtime_no_callbacks - Ignore runtime PM callbacks for a device. * @dev: Device to handle. * * Set the power.no_callbacks flag, which tells the PM core that this * device is power-managed through its parent and has no runtime PM * callbacks of its own. The runtime sysfs attributes will be removed. */ void pm_runtime_no_callbacks(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.no_callbacks = 1; spin_unlock_irq(&dev->power.lock); if (device_is_registered(dev)) rpm_sysfs_remove(dev); } EXPORT_SYMBOL_GPL(pm_runtime_no_callbacks); /** * pm_runtime_irq_safe - Leave interrupts disabled during callbacks. * @dev: Device to handle * * Set the power.irq_safe flag, which tells the PM core that the * ->runtime_suspend() and ->runtime_resume() callbacks for this device should * always be invoked with the spinlock held and interrupts disabled. It also * causes the parent's usage counter to be permanently incremented, preventing * the parent from runtime suspending -- otherwise an irq-safe child might have * to wait for a non-irq-safe parent. */ void pm_runtime_irq_safe(struct device *dev) { if (dev->parent) pm_runtime_get_sync(dev->parent); spin_lock_irq(&dev->power.lock); dev->power.irq_safe = 1; spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_irq_safe); /** * update_autosuspend - Handle a change to a device's autosuspend settings. * @dev: Device to handle. * @old_delay: The former autosuspend_delay value. * @old_use: The former use_autosuspend value. * * Prevent runtime suspend if the new delay is negative and use_autosuspend is * set; otherwise allow it. Send an idle notification if suspends are allowed. * * This function must be called under dev->power.lock with interrupts disabled. */ static void update_autosuspend(struct device *dev, int old_delay, int old_use) { int delay = dev->power.autosuspend_delay; /* Should runtime suspend be prevented now? */ if (dev->power.use_autosuspend && delay < 0) { /* If it used to be allowed then prevent it. */ if (!old_use || old_delay >= 0) { atomic_inc(&dev->power.usage_count); rpm_resume(dev, 0); } else { trace_rpm_usage(dev, 0); } } /* Runtime suspend should be allowed now. */ else { /* If it used to be prevented then allow it. */ if (old_use && old_delay < 0) atomic_dec(&dev->power.usage_count); /* Maybe we can autosuspend now. */ rpm_idle(dev, RPM_AUTO); } } /** * pm_runtime_set_autosuspend_delay - Set a device's autosuspend_delay value. * @dev: Device to handle. * @delay: Value of the new delay in milliseconds. * * Set the device's power.autosuspend_delay value. If it changes to negative * and the power.use_autosuspend flag is set, prevent runtime suspends. If it * changes the other way, allow runtime suspends. */ void pm_runtime_set_autosuspend_delay(struct device *dev, int delay) { int old_delay, old_use; spin_lock_irq(&dev->power.lock); old_delay = dev->power.autosuspend_delay; old_use = dev->power.use_autosuspend; dev->power.autosuspend_delay = delay; update_autosuspend(dev, old_delay, old_use); spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_set_autosuspend_delay); /** * __pm_runtime_use_autosuspend - Set a device's use_autosuspend flag. * @dev: Device to handle. * @use: New value for use_autosuspend. * * Set the device's power.use_autosuspend flag, and allow or prevent runtime * suspends as needed. */ void __pm_runtime_use_autosuspend(struct device *dev, bool use) { int old_delay, old_use; spin_lock_irq(&dev->power.lock); old_delay = dev->power.autosuspend_delay; old_use = dev->power.use_autosuspend; dev->power.use_autosuspend = use; update_autosuspend(dev, old_delay, old_use); spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(__pm_runtime_use_autosuspend); /** * pm_runtime_init - Initialize runtime PM fields in given device object. * @dev: Device object to initialize. */ void pm_runtime_init(struct device *dev) { dev->power.runtime_status = RPM_SUSPENDED; dev->power.last_status = RPM_INVALID; dev->power.idle_notification = false; dev->power.disable_depth = 1; atomic_set(&dev->power.usage_count, 0); dev->power.runtime_error = 0; atomic_set(&dev->power.child_count, 0); pm_suspend_ignore_children(dev, false); dev->power.runtime_auto = true; dev->power.request_pending = false; dev->power.request = RPM_REQ_NONE; dev->power.deferred_resume = false; dev->power.needs_force_resume = 0; INIT_WORK(&dev->power.work, pm_runtime_work); dev->power.timer_expires = 0; hrtimer_init(&dev->power.suspend_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); dev->power.suspend_timer.function = pm_suspend_timer_fn; init_waitqueue_head(&dev->power.wait_queue); } /** * pm_runtime_reinit - Re-initialize runtime PM fields in given device object. * @dev: Device object to re-initialize. */ void pm_runtime_reinit(struct device *dev) { if (!pm_runtime_enabled(dev)) { if (dev->power.runtime_status == RPM_ACTIVE) pm_runtime_set_suspended(dev); if (dev->power.irq_safe) { spin_lock_irq(&dev->power.lock); dev->power.irq_safe = 0; spin_unlock_irq(&dev->power.lock); if (dev->parent) pm_runtime_put(dev->parent); } } } /** * pm_runtime_remove - Prepare for removing a device from device hierarchy. * @dev: Device object being removed from device hierarchy. */ void pm_runtime_remove(struct device *dev) { __pm_runtime_disable(dev, false); pm_runtime_reinit(dev); } /** * pm_runtime_get_suppliers - Resume and reference-count supplier devices. * @dev: Consumer device. */ void pm_runtime_get_suppliers(struct device *dev) { struct device_link *link; int idx; idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) if (link->flags & DL_FLAG_PM_RUNTIME) { link->supplier_preactivated = true; pm_runtime_get_sync(link->supplier); } device_links_read_unlock(idx); } /** * pm_runtime_put_suppliers - Drop references to supplier devices. * @dev: Consumer device. */ void pm_runtime_put_suppliers(struct device *dev) { struct device_link *link; int idx; idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) if (link->supplier_preactivated) { link->supplier_preactivated = false; pm_runtime_put(link->supplier); } device_links_read_unlock(idx); } void pm_runtime_new_link(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.links_count++; spin_unlock_irq(&dev->power.lock); } static void pm_runtime_drop_link_count(struct device *dev) { spin_lock_irq(&dev->power.lock); WARN_ON(dev->power.links_count == 0); dev->power.links_count--; spin_unlock_irq(&dev->power.lock); } /** * pm_runtime_drop_link - Prepare for device link removal. * @link: Device link going away. * * Drop the link count of the consumer end of @link and decrement the supplier * device's runtime PM usage counter as many times as needed to drop all of the * PM runtime reference to it from the consumer. */ void pm_runtime_drop_link(struct device_link *link) { if (!(link->flags & DL_FLAG_PM_RUNTIME)) return; pm_runtime_drop_link_count(link->consumer); pm_runtime_release_supplier(link); pm_request_idle(link->supplier); } static bool pm_runtime_need_not_resume(struct device *dev) { return atomic_read(&dev->power.usage_count) <= 1 && (atomic_read(&dev->power.child_count) == 0 || dev->power.ignore_children); } /** * pm_runtime_force_suspend - Force a device into suspend state if needed. * @dev: Device to suspend. * * Disable runtime PM so we safely can check the device's runtime PM status and * if it is active, invoke its ->runtime_suspend callback to suspend it and * change its runtime PM status field to RPM_SUSPENDED. Also, if the device's * usage and children counters don't indicate that the device was in use before * the system-wide transition under way, decrement its parent's children counter * (if there is a parent). Keep runtime PM disabled to preserve the state * unless we encounter errors. * * Typically this function may be invoked from a system suspend callback to make * sure the device is put into low power state and it should only be used during * system-wide PM transitions to sleep states. It assumes that the analogous * pm_runtime_force_resume() will be used to resume the device. * * Do not use with DPM_FLAG_SMART_SUSPEND as this can lead to an inconsistent * state where this function has called the ->runtime_suspend callback but the * PM core marks the driver as runtime active. */ int pm_runtime_force_suspend(struct device *dev) { int (*callback)(struct device *); int ret; pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev)) return 0; callback = RPM_GET_CALLBACK(dev, runtime_suspend); dev_pm_enable_wake_irq_check(dev, true); ret = callback ? callback(dev) : 0; if (ret) goto err; dev_pm_enable_wake_irq_complete(dev); /* * If the device can stay in suspend after the system-wide transition * to the working state that will follow, drop the children counter of * its parent, but set its status to RPM_SUSPENDED anyway in case this * function will be called again for it in the meantime. */ if (pm_runtime_need_not_resume(dev)) { pm_runtime_set_suspended(dev); } else { __update_runtime_status(dev, RPM_SUSPENDED); dev->power.needs_force_resume = 1; } return 0; err: dev_pm_disable_wake_irq_check(dev, true); pm_runtime_enable(dev); return ret; } EXPORT_SYMBOL_GPL(pm_runtime_force_suspend); /** * pm_runtime_force_resume - Force a device into resume state if needed. * @dev: Device to resume. * * Prior invoking this function we expect the user to have brought the device * into low power state by a call to pm_runtime_force_suspend(). Here we reverse * those actions and bring the device into full power, if it is expected to be * used on system resume. In the other case, we defer the resume to be managed * via runtime PM. * * Typically this function may be invoked from a system resume callback. */ int pm_runtime_force_resume(struct device *dev) { int (*callback)(struct device *); int ret = 0; if (!pm_runtime_status_suspended(dev) || !dev->power.needs_force_resume) goto out; /* * The value of the parent's children counter is correct already, so * just update the status of the device. */ __update_runtime_status(dev, RPM_ACTIVE); callback = RPM_GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); ret = callback ? callback(dev) : 0; if (ret) { pm_runtime_set_suspended(dev); dev_pm_enable_wake_irq_check(dev, false); goto out; } pm_runtime_mark_last_busy(dev); out: dev->power.needs_force_resume = 0; pm_runtime_enable(dev); return ret; } EXPORT_SYMBOL_GPL(pm_runtime_force_resume); |
412 157 157 20 228 40 35 35 19 19 42 19 42 42 40 167 1 59 352 6 181 492 434 261 214 214 214 5 5 1 5 137 137 12 204 204 203 203 203 5 198 203 203 203 203 203 204 204 204 203 173 204 204 1 204 204 204 204 214 214 218 218 12 12 12 12 12 54 39 47 47 47 47 47 47 47 47 47 214 214 214 54 208 208 208 208 208 208 214 47 208 208 208 208 208 5 216 216 24 192 192 192 216 216 216 216 24 166 134 134 134 134 134 141 166 158 158 158 59 158 166 166 166 166 81 304 142 304 417 56 417 15 15 14 14 15 15 15 15 15 158 166 35 34 107 157 126 157 165 141 141 141 216 190 245 214 214 210 210 208 208 208 208 208 208 208 208 208 204 208 208 138 208 204 239 208 138 204 204 204 204 204 204 8 8 239 208 24 24 239 212 239 212 214 214 214 214 214 214 214 214 214 214 239 239 214 214 214 26 212 212 212 212 239 214 214 214 214 222 221 220 220 220 18 216 184 216 5 216 216 25 25 25 25 25 56 55 56 56 2 2 56 25 25 166 166 166 165 59 165 165 165 165 165 165 165 25 25 25 25 25 15 25 25 25 25 18 54 54 54 46 46 46 35 35 21 20 20 54 45 53 53 35 23 19 53 19 4 16 15 52 25 38 52 3 3 4 2 2 1 1 1 2 1 1 1 19 19 17 16 19 14 2 4 11 12 34 10 8 7 7 10 1 1 1 1 1 1 4 3 2 2 5 262 262 11 13 20 20 498 499 498 499 62 54 53 8 7 6 61 158 158 158 158 158 158 49 166 166 166 166 166 166 166 166 165 210 28 28 3 25 340 340 231 231 340 340 259 259 259 258 258 111 111 1 1 1 56 56 56 56 56 52 52 6 4 72 72 72 72 15 15 15 72 15 15 15 1 1 72 15 72 72 15 15 15 15 15 15 15 72 72 15 15 72 72 7 72 15 72 15 15 72 72 72 1 24 24 24 24 24 24 24 24 25 25 22 24 25 25 24 1 157 157 157 157 107 157 157 157 143 157 157 157 157 157 157 157 157 157 141 141 141 141 141 141 141 141 141 62 141 141 107 141 142 142 141 141 142 142 142 141 141 140 4 81 81 81 15 15 15 15 1244 59 87 88 86 60 60 59 86 14 85 82 3262 3263 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 | /* * Generic process-grouping system. * * Based originally on the cpuset system, extracted by Paul Menage * Copyright (C) 2006 Google, Inc * * Notifications support * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * * Copyright notices from the original cpuset code: * -------------------------------------------------- * Copyright (C) 2003 BULL SA. * Copyright (C) 2004-2006 Silicon Graphics, Inc. * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel * * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * --------------------------------------------------- * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "cgroup-internal.h" #include <linux/bpf-cgroup.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/init_task.h> #include <linux/kernel.h> #include <linux/magic.h> #include <linux/mutex.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/percpu-rwsem.h> #include <linux/string.h> #include <linux/hashtable.h> #include <linux/idr.h> #include <linux/kthread.h> #include <linux/atomic.h> #include <linux/cpuset.h> #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/file.h> #include <linux/fs_parser.h> #include <linux/sched/cputime.h> #include <linux/sched/deadline.h> #include <linux/psi.h> #include <net/sock.h> #define CREATE_TRACE_POINTS #include <trace/events/cgroup.h> #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) /* let's not notify more than 100 times per second */ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* * To avoid confusing the compiler (and generating warnings) with code * that attempts to access what would be a 0-element array (i.e. sized * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this * constant expression can be added. */ #define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * * css_set_lock protects task->cgroups pointer, the list of css_set * objects, and the chain of tasks off each css_set. * * These locks are exported if CONFIG_PROVE_RCU so that accessors in * cgroup.h can use them for lockdep annotations. */ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); #ifdef CONFIG_PROVE_RCU EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; static bool cgroup_debug __read_mostly; /* * Protects cgroup_idr and css_idr so that IDs can be released without * grabbing cgroup_mutex. */ static DEFINE_SPINLOCK(cgroup_idr_lock); /* * Protects cgroup_file->kn for !self csses. It synchronizes notifications * against file removal/re-creation across css hiding. */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_wq * which may lead to deadlock. */ static struct workqueue_struct *cgroup_destroy_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, struct cgroup_subsys *cgroup_subsys[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of cgroup subsystem names */ #define SUBSYS(_x) [_x ## _cgrp_id] = #_x, static const char *cgroup_subsys_name[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ #define SUBSYS(_x) \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, static struct static_key_true *cgroup_subsys_enabled_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, static struct static_key_true *cgroup_subsys_on_dfl_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); /* the default hierarchy */ struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ static bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ static u16 cgrp_dfl_implicit_ss_mask; /* some controllers can be threaded on the default hierarchy */ static u16 cgrp_dfl_threaded_ss_mask; /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); /* * Assign a monotonically increasing serial number to csses. It guarantees * cgroups with bigger numbers are newer than those with smaller numbers. * Also, as csses are always appended to the parent's ->children list, it * guarantees that sibling csses are always sorted in the ascending serial * number order on the list. Protected by cgroup_mutex. */ static u64 css_serial_nr_next = 1; /* * These bitmasks identify subsystems with specific features to avoid * having to do iterative checks repeatedly. */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns.count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, .root_cset = &init_css_set, }; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_base_files[]; static struct cftype cgroup_psi_files[]; /* cgroup optional features */ enum cgroup_opt_features { #ifdef CONFIG_PSI OPT_FEATURE_PRESSURE, #endif OPT_FEATURE_COUNT }; static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = { #ifdef CONFIG_PSI "pressure", #endif }; static u16 cgroup_feature_disable_mask __read_mostly; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); #ifdef CONFIG_DEBUG_CGROUP_REF #define CGROUP_REF_FN_ATTRS noinline #define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn); #include <linux/cgroup_refcnt.h> #endif /** * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID * @ssid: subsys ID of interest * * cgroup_subsys_enabled() can only be used with literal subsys names which * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid. */ bool cgroup_ssid_enabled(int ssid) { if (!CGROUP_HAS_SUBSYS_CONFIG) return false; return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest * * The default hierarchy is the v2 interface of cgroup and this function * can be used to test whether a cgroup is on the default hierarchy for * cases where a subsystem should behave differently depending on the * interface version. * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" * and "name" are disallowed. * * - When mounting an existing superblock, mount options should match. * * - rename(2) is disallowed. * * - "tasks" is removed. Everything should be at process granularity. Use * "cgroup.procs" instead. * * - "cgroup.procs" is not sorted. pids will be unique unless they got * recycled in-between reads. * * - "release_agent" and "notify_on_release" are removed. Replacement * notification mechanism will be implemented. * * - "cgroup.clone_children" is removed. * * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup * and its descendants contain no task; otherwise, 1. The file also * generates kernfs notification which can be monitored through poll and * [di]notify when the value of the file changes. * * - cpuset: tasks will be kept in empty cpusets when hotplug happens and * take masks of ancestors with non-empty cpus/mems, instead of being * moved to an ancestor. * * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * * - blkcg: blk-throttle becomes properly hierarchical. */ bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) { int ret; idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; } static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) { void *ret; spin_lock_bh(&cgroup_idr_lock); ret = idr_replace(idr, ptr, id); spin_unlock_bh(&cgroup_idr_lock); return ret; } static void cgroup_idr_remove(struct idr *idr, int id) { spin_lock_bh(&cgroup_idr_lock); idr_remove(idr, id); spin_unlock_bh(&cgroup_idr_lock); } static bool cgroup_has_tasks(struct cgroup *cgrp) { return cgrp->nr_populated_csets; } static bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; } /* can @cgrp host both domain and threaded children? */ static bool cgroup_is_mixable(struct cgroup *cgrp) { /* * Root isn't under domain level resource control exempting it from * the no-internal-process constraint, so it can serve as a thread * root and a parent of resource domains at the same time. */ return !cgroup_parent(cgrp); } /* can @cgrp become a thread root? Should always be true for a thread root */ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) { /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return true; /* domain roots can't be nested under threaded */ if (cgroup_is_threaded(cgrp)) return false; /* can only have either domain or threaded children */ if (cgrp->nr_populated_domain_children) return false; /* and no domain controllers can be enabled */ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return false; return true; } /* is @cgrp root of a threaded subtree? */ static bool cgroup_is_thread_root(struct cgroup *cgrp) { /* thread root should be a domain */ if (cgroup_is_threaded(cgrp)) return false; /* a domain w/ threaded children is a thread root */ if (cgrp->nr_threaded_children) return true; /* * A domain which has tasks and explicit threaded controllers * enabled is a thread root. */ if (cgroup_has_tasks(cgrp) && (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask)) return true; return false; } /* a domain which isn't connected to the root w/o brekage can't be used */ static bool cgroup_is_valid_domain(struct cgroup *cgrp) { /* the cgroup itself can be a thread root */ if (cgroup_is_threaded(cgrp)) return false; /* but the ancestors can't be unless mixable */ while ((cgrp = cgroup_parent(cgrp))) { if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) return false; if (cgroup_is_threaded(cgrp)) return false; } return true; } /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); u16 root_ss_mask = cgrp->root->subsys_mask; if (parent) { u16 ss_mask = parent->subtree_control; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } if (cgroup_on_dfl(cgrp)) root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | cgrp_dfl_implicit_ss_mask); return root_ss_mask; } /* subsystems enabled on a cgroup */ static u16 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); if (parent) { u16 ss_mask = parent->subtree_ss_mask; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } return cgrp->root->subsys_mask; } /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and * the caller is responsible for pinning the returned css if it wants to * keep accessing it outside the said locks. This function may return * %NULL if @cgrp doesn't have @subsys_id enabled. */ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (CGROUP_HAS_SUBSYS_CONFIG && ss) return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else return &cgrp->self; } /** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effective css, which is defined * as the matching css of the nearest ancestor including self which has @ss * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); if (!ss) return &cgrp->self; /* * This function is used while updating css associations and thus * can't test the csses directly. Test ss_mask. */ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { cgrp = cgroup_parent(cgrp); if (!cgrp) return NULL; } return cgroup_css(cgrp, ss); } /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * * The returned css is not guaranteed to be online, and therefore it is the * callers responsibility to try get a reference for it. */ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL; do { css = cgroup_css(cgrp, ss); if (css) return css; cgrp = cgroup_parent(cgrp); } while (cgrp); return init_css_set.subsys[ss->id]; } /** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * The returned css must be put using css_put(). */ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL; rcu_read_lock(); do { css = cgroup_css(cgrp, ss); if (css && css_tryget_online(css)) goto out_unlock; cgrp = cgroup_parent(cgrp); } while (cgrp); css = init_css_set.subsys[ss->id]; css_get(css); out_unlock: rcu_read_unlock(); return css; } EXPORT_SYMBOL_GPL(cgroup_get_e_css); static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); cgroup_get(cgrp); } /** * __cgroup_task_count - count the number of tasks in a cgroup. The caller * is responsible for taking the css_set_lock. * @cgrp: the cgroup in question */ int __cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += link->cset->nr_tasks; return count; } /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question */ int cgroup_task_count(const struct cgroup *cgrp) { int count; spin_lock_irq(&css_set_lock); count = __cgroup_task_count(cgrp); spin_unlock_irq(&css_set_lock); return count; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of_cft(of); /* * This is open and unprotected implementation of cgroup_css(). * seq_css() is only called from a kernfs file operation which has * an active reference on the file. Because all the subsystem * files are drained before a css is disassociated with a cgroup, * the matching css from the cgroup's subsys table is guaranteed to * be and stay valid until the enclosing operation is complete. */ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else return &cgrp->self; } EXPORT_SYMBOL_GPL(of_css); /** * for_each_css - iterate all css's of a cgroup * @css: the iteration cursor * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * * Should be called under cgroup_mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ lockdep_is_held(&cgroup_mutex)))) { } \ else /** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end * @ss_mask: the bitmask * * The block will only run for cases where the ssid-th bit (1 << ssid) of * @ss_mask is set. */ #define do_each_subsys_mask(ss, ssid, ss_mask) do { \ unsigned long __ss_mask = (ss_mask); \ if (!CGROUP_HAS_SUBSYS_CONFIG) { \ (ssid) = 0; \ break; \ } \ for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ (ss) = cgroup_subsys[ssid]; \ { #define while_each_subsys_mask() \ } \ } \ } while (false) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ else /* walk live descendants in pre order */ #define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /* walk live descendants in postorder */ #define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state * for each subsystem. Also used to anchor the list of css_sets. Not * reference-counted, to improve performance when child cgroups * haven't been created. */ struct css_set init_css_set = { .refcount = REFCOUNT_INIT(1), .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), /* * The following field is re-initialized when this cset gets linked * in cgroup_init(). However, let's initialize the field * statically too so that the default cgroup can be accessed safely * early during boot. */ .dfl_cgrp = &cgrp_dfl_root.cgrp, }; static int css_set_count = 1; /* 1 for init_css_set */ static bool css_set_threaded(struct css_set *cset) { return cset->dom_cset != cset; } /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set * * css_set_populated() should be the same as !!cset->nr_tasks at steady * state. However, css_set_populated() can be called while a task is being * added to or removed from the linked list before the nr_tasks is * properly updated. Hence, we can't just look at ->nr_tasks here. */ static bool css_set_populated(struct css_set *cset) { lockdep_assert_held(&css_set_lock); return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); } /** * cgroup_update_populated - update the populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * * One of the css_sets associated with @cgrp is either getting its first * task or losing the last. Update @cgrp->nr_populated_* accordingly. The * count is propagated towards root so that a given cgroup's * nr_populated_children is zero iff none of its descendants contain any * tasks. * * @cgrp's interface file "cgroup.populated" is zero if both * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and * 1 otherwise. When the sum changes from or to zero, userland is notified * that the content of the interface file has changed. This can be used to * detect when @cgrp and its descendants become populated or empty. */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { struct cgroup *child = NULL; int adj = populated ? 1 : -1; lockdep_assert_held(&css_set_lock); do { bool was_populated = cgroup_is_populated(cgrp); if (!child) { cgrp->nr_populated_csets += adj; } else { if (cgroup_is_threaded(child)) cgrp->nr_populated_threaded_children += adj; else cgrp->nr_populated_domain_children += adj; } if (was_populated == cgroup_is_populated(cgrp)) break; cgroup1_check_for_release(cgrp); TRACE_CGROUP_PATH(notify_populated, cgrp, cgroup_is_populated(cgrp)); cgroup_file_notify(&cgrp->events_file); child = cgrp; cgrp = cgroup_parent(cgrp); } while (cgrp); } /** * css_set_update_populated - update populated state of a css_set * @cset: target css_set * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the * populated counters of all associated cgroups accordingly. */ static void css_set_update_populated(struct css_set *cset, bool populated) { struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) cgroup_update_populated(link->cgrp, populated); } /* * @task is leaving, advance task iterators which are pointing to it so * that they can resume at the next position. Advancing an iterator might * remove it from the list, use safe walk. See css_task_iter_skip() for * details. */ static void css_set_skip_task_iters(struct css_set *cset, struct task_struct *task) { struct css_task_iter *it, *pos; list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) css_task_iter_skip(it, task); } /** * css_set_move_task - move a task from one css_set to another * @task: task being moved * @from_cset: css_set @task currently belongs to (may be NULL) * @to_cset: new css_set @task is being moved to (may be NULL) * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks * * Move @task from @from_cset to @to_cset. If @task didn't belong to any * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * * This function automatically handles populated counter updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts. */ static void css_set_move_task(struct task_struct *task, struct css_set *from_cset, struct css_set *to_cset, bool use_mg_tasks) { lockdep_assert_held(&css_set_lock); if (to_cset && !css_set_populated(to_cset)) css_set_update_populated(to_cset, true); if (from_cset) { WARN_ON_ONCE(list_empty(&task->cg_list)); css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); } else { WARN_ON_ONCE(!list_empty(&task->cg_list)); } if (to_cset) { /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race * against cgroup_exit()/cgroup_free() dropping the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); cgroup_move_task(task, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } } /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into * account cgroups in empty hierarchies. */ #define CSS_SET_HASH_BITS 7 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); static unsigned long css_set_hash(struct cgroup_subsys_state **css) { unsigned long key = 0UL; struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) key += (unsigned long)css[i]; key = (key >> 16) ^ key; return key; } void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&css_set_lock); if (!refcount_dec_and_test(&cset->refcount)) return; WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); /* This css_set is dead. Unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); css_put(cset->subsys[ssid]); } hash_del(&cset->hlist); css_set_count--; list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); if (cgroup_parent(link->cgrp)) cgroup_put(link->cgrp); kfree(link); } if (css_set_threaded(cset)) { list_del(&cset->threaded_csets_node); put_css_set_locked(cset->dom_cset); } kfree_rcu(cset, rcu_head); } /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested * @old_cset: existing css_set for a task * @new_cgrp: cgroup that's being entered by the task * @template: desired set of css pointers in css_set (pre-calculated) * * Returns true if "cset" matches "old_cset" except for the hierarchy * which "new_cgrp" belongs to, for which it should match "new_cgrp". */ static bool compare_css_sets(struct css_set *cset, struct css_set *old_cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* * On the default hierarchy, there can be csets which are * associated with the same set of cgroups but different csses. * Let's first ensure that csses match. */ if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; /* @cset's domain should match the default cgroup's */ if (cgroup_on_dfl(new_cgrp)) new_dfl_cgrp = new_cgrp; else new_dfl_cgrp = old_cset->dfl_cgrp; if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) return false; /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. As different cgroups may * share the same effective css, this comparison is always * necessary. */ l1 = &cset->cgrp_links; l2 = &old_cset->cgrp_links; while (1) { struct cgrp_cset_link *link1, *link2; struct cgroup *cgrp1, *cgrp2; l1 = l1->next; l2 = l2->next; /* See if we reached the end - both lists are equal length. */ if (l1 == &cset->cgrp_links) { BUG_ON(l2 != &old_cset->cgrp_links); break; } else { BUG_ON(l2 == &old_cset->cgrp_links); } /* Locate the cgroups associated with these links. */ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); cgrp1 = link1->cgrp; cgrp2 = link2->cgrp; /* Hierarchies should be linked in the same order. */ BUG_ON(cgrp1->root != cgrp2->root); /* * If this hierarchy is the hierarchy of the cgroup * that's changing, then we need to check that this * css_set points to the new cgroup; if it's any other * hierarchy, then this css_set should point to the * same cgroup as the old css_set. */ if (cgrp1->root == new_cgrp->root) { if (cgrp1 != new_cgrp) return false; } else { if (cgrp1 != cgrp2) return false; } } return true; } /** * find_existing_css_set - init css array and find the matching css_set * @old_cset: the css_set that we're using before the cgroup transition * @cgrp: the cgroup that we're moving into * @template: out param for the new set of csses, should be clear on entry */ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state **template) { struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsigned long key; int i; /* * Build the set of subsystem state objects that we want to see in the * new css_set. While subsystems can change globally, the entries here * won't change, so no need for locking. */ for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { /* * @ss is in this hierarchy, so we want the * effective css from @cgrp. */ template[i] = cgroup_e_css_by_mask(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want * to change the css. */ template[i] = old_cset->subsys[i]; } } key = css_set_hash(template); hash_for_each_possible(css_set_table, cset, hlist, key) { if (!compare_css_sets(cset, old_cset, cgrp, template)) continue; /* This css_set matches what we need */ return cset; } /* No existing cgroup group matched */ return NULL; } static void free_cgrp_cset_links(struct list_head *links_to_free) { struct cgrp_cset_link *link, *tmp_link; list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { list_del(&link->cset_link); kfree(link); } } /** * allocate_cgrp_cset_links - allocate cgrp_cset_links * @count: the number of links to allocate * @tmp_links: list_head the allocated links are put on * * Allocate @count cgrp_cset_link structures and chain them on @tmp_links * through ->cset_link. Returns 0 on success or -errno. */ static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) { struct cgrp_cset_link *link; int i; INIT_LIST_HEAD(tmp_links); for (i = 0; i < count; i++) { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { free_cgrp_cset_links(tmp_links); return -ENOMEM; } list_add(&link->cset_link, tmp_links); } return 0; } /** * link_css_set - a helper function to link a css_set to a cgroup * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() * @cset: the css_set to be linked * @cgrp: the destination cgroup */ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgroup *cgrp) { struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); if (cgroup_on_dfl(cgrp)) cset->dfl_cgrp = cgrp; link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; /* * Always add links to the tail of the lists so that the lists are * in chronological order. */ list_move_tail(&link->cset_link, &cgrp->cset_links); list_add_tail(&link->cgrp_link, &cset->cgrp_links); if (cgroup_parent(cgrp)) cgroup_get_live(cgrp); } /** * find_css_set - return a new css_set with one cgroup updated * @old_cset: the baseline css_set * @cgrp: the cgroup to be updated * * Return a new css_set that's equivalent to @old_cset, but with @cgrp * substituted into the appropriate hierarchy. */ static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup *cgrp) { struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; struct cgroup_subsys *ss; unsigned long key; int ssid; lockdep_assert_held(&cgroup_mutex); /* First see if we already have a cgroup group that matches * the desired set */ spin_lock_irq(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); spin_unlock_irq(&css_set_lock); if (cset) return cset; cset = kzalloc(sizeof(*cset), GFP_KERNEL); if (!cset) return NULL; /* Allocate all the cgrp_cset_link objects that we'll need */ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { kfree(cset); return NULL; } refcount_set(&cset->refcount, 1); cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->mg_src_preload_node); INIT_LIST_HEAD(&cset->mg_dst_preload_node); INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); spin_lock_irq(&css_set_lock); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == cgrp->root) c = cgrp; link_css_set(&tmp_links, cset, c); } BUG_ON(!list_empty(&tmp_links)); css_set_count++; /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cset->subsys[ssid]; list_add_tail(&cset->e_cset_node[ssid], &css->cgroup->e_csets[ssid]); css_get(css); } spin_unlock_irq(&css_set_lock); /* * If @cset should be threaded, look up the matching dom_cset and * link them up. We first fully initialize @cset then look for the * dom_cset. It's simpler this way and safe as @cset is guaranteed * to stay empty until we return. */ if (cgroup_is_threaded(cset->dfl_cgrp)) { struct css_set *dcset; dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); if (!dcset) { put_css_set(cset); return NULL; } spin_lock_irq(&css_set_lock); cset->dom_cset = dcset; list_add_tail(&cset->threaded_csets_node, &dcset->threaded_csets); spin_unlock_irq(&css_set_lock); } return cset; } struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv; return root_cgrp->root; } void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) { bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ if (favor && !favoring) { rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); root->flags |= CGRP_ROOT_FAVOR_DYNMODS; } else if (!favor && favoring) { rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; } } static int cgroup_init_root_id(struct cgroup_root *root) { int id; lockdep_assert_held(&cgroup_mutex); id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); if (id < 0) return id; root->hierarchy_id = id; return 0; } static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } void cgroup_free_root(struct cgroup_root *root) { kfree(root); } static void cgroup_destroy_root(struct cgroup_root *root) { struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; trace_cgroup_destroy_root(root); cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's * root cgroup */ spin_lock_irq(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); kfree(link); } spin_unlock_irq(&css_set_lock); if (!list_empty(&root->root_list)) { list_del(&root->root_list); cgroup_root_count--; } cgroup_favor_dynmods(root, false); cgroup_exit_root_id(root); cgroup_unlock(); cgroup_rstat_exit(cgrp); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } /* * Returned cgroup is without refcount but it's valid as long as cset pins it. */ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { struct cgroup *res_cgroup = NULL; if (cset == &init_css_set) { res_cgroup = &root->cgrp; } else if (root == &cgrp_dfl_root) { res_cgroup = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == root) { res_cgroup = c; break; } } } BUG_ON(!res_cgroup); return res_cgroup; } /* * look up cgroup associated with current task's cgroup namespace on the * specified hierarchy */ static struct cgroup * current_cgns_cgroup_from_root(struct cgroup_root *root) { struct cgroup *res = NULL; struct css_set *cset; lockdep_assert_held(&css_set_lock); rcu_read_lock(); cset = current->nsproxy->cgroup_ns->root_cset; res = __cset_cgroup_from_root(cset, root); rcu_read_unlock(); return res; } /* * Look up cgroup associated with current task's cgroup namespace on the default * hierarchy. * * Unlike current_cgns_cgroup_from_root(), this doesn't need locks: * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu * pointers. * - css_set_lock is not needed because we just read cset->dfl_cgrp. * - As a bonus returned cgrp is pinned with the current because it cannot * switch cgroup_ns asynchronously. */ static struct cgroup *current_cgns_cgroup_dfl(void) { struct css_set *cset; if (current->nsproxy) { cset = current->nsproxy->cgroup_ns->root_cset; return __cset_cgroup_from_root(cset, &cgrp_dfl_root); } else { /* * NOTE: This function may be called from bpf_cgroup_from_id() * on a task which has already passed exit_task_namespaces() and * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all * cgroups visible for lookups. */ return &cgrp_dfl_root.cgrp; } } /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); return __cset_cgroup_from_root(cset, root); } /* * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex and css_set_lock held. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) { /* * No need to lock the task - since we hold css_set_lock the * task can't change groups. */ return cset_cgroup_from_root(task_css_set(task), root); } /* * A task must hold cgroup_mutex to modify cgroups. * * Any task can increment and decrement the count field without lock. * So in general, code holding cgroup_mutex can't rely on the count * field not changing. However, if the count goes to zero, then only * cgroup_attach_task() can increment it again. Because a count of zero * means that no tasks are currently attached, therefore there is no * way a task attached to that cgroup can fork (the other way to * increment the count). So code holding cgroup_mutex can safely * assume that if the count is zero, it will stay zero. Similarly, if * a task holds cgroup_mutex on a cgroup with zero count, it * knows that the cgroup won't be removed, as cgroup_rmdir() * needs that mutex. * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at * least one task in the system (init, pid == 1), therefore, root cgroup * always has either children cgroups and/or using tasks. So we don't * need a special hack to ensure that root cgroup cannot be deleted. * * P.S. One more locking exception. RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() */ static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) { struct cgroup_subsys *ss = cft->ss; if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : ""; snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); } else { strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); } return buf; } /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question * * S_IRUGO for read, S_IWUSR for write. */ static umode_t cgroup_file_mode(const struct cftype *cft) { umode_t mode = 0; if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write) { if (cft->flags & CFTYPE_WORLD_WRITABLE) mode |= S_IWUGO; else mode |= S_IWUSR; } return mode; } /** * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask * @subtree_control: the new subtree_control mask to consider * @this_ss_mask: available subsystems * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * * This function calculates which subsystems need to be enabled if * @subtree_control is to be applied while restricted to @this_ss_mask. */ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) { u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { u16 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; } while_each_subsys_mask(); /* * Mask out subsystems which aren't available. This can * happen only if some depended-upon subsystems were bound * to non-default hierarchies. */ new_ss_mask &= this_ss_mask; if (new_ss_mask == cur_ss_mask) break; cur_ss_mask = new_ss_mask; } return cur_ss_mask; } /** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * * This helper undoes cgroup_kn_lock_live() and should be invoked before * the method finishes if locking succeeded. Note that once this function * returns the cgroup returned by cgroup_kn_lock_live() may become * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function. */ void cgroup_kn_unlock(struct kernfs_node *kn) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn->parent->priv; cgroup_unlock(); kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); } /** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn->parent->priv; /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. cgroup liveliness check alone provides enough * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. */ if (!cgroup_tryget(cgrp)) return NULL; kernfs_break_active_protection(kn); if (drain_offline) cgroup_lock_and_drain_offline(cgrp); else cgroup_lock(); if (!cgroup_is_dead(cgrp)) return cgrp; cgroup_kn_unlock(kn); return NULL; } static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_mutex); if (cft->file_offset) { struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); struct cgroup_file *cfile = (void *)css + cft->file_offset; spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = NULL; spin_unlock_irq(&cgroup_file_kn_lock); del_timer_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** * css_clear_dir - remove subsys files in a cgroup directory * @css: target css */ static void css_clear_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts; if (!(css->flags & CSS_VISIBLE)) return; css->flags &= ~CSS_VISIBLE; if (!css->ss) { if (cgroup_on_dfl(cgrp)) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); if (cgroup_psi_enabled()) cgroup_addrm_files(css, cgrp, cgroup_psi_files, false); } else { cgroup_addrm_files(css, cgrp, cgroup1_base_files, false); } } else { list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); } } /** * css_populate_dir - create subsys files in a cgroup directory * @css: target css * * On failure, no file is added. */ static int css_populate_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts, *failed_cfts; int ret; if (css->flags & CSS_VISIBLE) return 0; if (!css->ss) { if (cgroup_on_dfl(cgrp)) { ret = cgroup_addrm_files(&cgrp->self, cgrp, cgroup_base_files, true); if (ret < 0) return ret; if (cgroup_psi_enabled()) { ret = cgroup_addrm_files(&cgrp->self, cgrp, cgroup_psi_files, true); if (ret < 0) return ret; } } else { cgroup_addrm_files(css, cgrp, cgroup1_base_files, true); } } else { list_for_each_entry(cfts, &css->ss->cfts, node) { ret = cgroup_addrm_files(css, cgrp, cfts, true); if (ret < 0) { failed_cfts = cfts; goto err; } } } css->flags |= CSS_VISIBLE; return 0; err: list_for_each_entry(cfts, &css->ss->cfts, node) { if (cfts == failed_cfts) break; cgroup_addrm_files(css, cgrp, cfts, false); } return ret; } int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); do_each_subsys_mask(ss, ssid, ss_mask) { /* * If @ss has non-root csses attached to it, can't move. * If @ss is an implicit controller, it is exempt from this * rule and can be stolen. */ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; /* * Collect ssid's that need to be disabled from default * hierarchy. */ if (ss->root == &cgrp_dfl_root) dfl_disable_ss_mask |= 1 << ssid; } while_each_subsys_mask(); if (dfl_disable_ss_mask) { struct cgroup *scgrp = &cgrp_dfl_root.cgrp; /* * Controllers from default hierarchy that need to be rebound * are all disabled together in one go. */ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); struct css_set *cset, *cset_pos; struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); if (src_root != &cgrp_dfl_root) { /* disable from the source */ src_root->subsys_mask &= ~(1 << ssid); WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); /* * all css_sets of scgrp together in same order to dcgrp, * patch in-flight iterators to preserve correct iteration. * since the iterator is always advanced right away and * finished when it->cset_pos meets it->cset_head, so only * update it->cset_head is enough here. */ list_for_each_entry(it, &cset->task_iters, iters_node) if (it->cset_head == &scgrp->e_csets[ss->id]) it->cset_head = &dcgrp->e_csets[ss->id]; } spin_unlock_irq(&css_set_lock); if (ss->css_rstat_flush) { list_del_rcu(&css->rstat_css_node); synchronize_rcu(); list_add_rcu(&css->rstat_css_node, &dcgrp->rstat_css_list); } /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } ret = cgroup_apply_control(dcgrp); if (ret) pr_warn("partial failure to rebind %s controller (err=%d)\n", ss->name, ret); if (ss->bind) ss->bind(css); } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; } int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); struct cgroup *ns_cgroup; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) return -ENOMEM; spin_lock_irq(&css_set_lock); ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); spin_unlock_irq(&css_set_lock); if (len >= PATH_MAX) len = -ERANGE; else if (len > 0) { seq_escape(sf, buf, " \t\n\\"); len = 0; } kfree(buf); return len; } enum cgroup2_param { Opt_nsdelegate, Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), {} }; static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct fs_parse_result result; int opt; opt = fs_parse(fc, cgroup2_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; case Opt_favordynmods: ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; return 0; case Opt_memory_localevents: ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; return 0; case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; } return -EINVAL; } static void apply_cgroup_root_flags(unsigned int root_flags) { if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { if (root_flags & CGRP_ROOT_NS_DELEGATE) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; cgroup_favor_dynmods(&cgrp_dfl_root, root_flags & CGRP_ROOT_FAVOR_DYNMODS); if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; } } static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS) seq_puts(seq, ",favordynmods"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) seq_puts(seq, ",memory_recursiveprot"); return 0; } static int cgroup_reconfigure(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); apply_cgroup_root_flags(ctx->flags); return 0; } static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; int ssid; INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; INIT_LIST_HEAD(&cgrp->rstat_css_list); prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } void init_cgroup_root(struct cgroup_fs_context *ctx) { struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); /* DYNMODS must be modified through cgroup_favor_dynmods() */ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; if (ctx->release_agent) strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); if (ctx->name) strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); if (ctx->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret; lockdep_assert_held(&cgroup_mutex); ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out; /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. Later rebinding may disable * controllers on the default hierarchy and thus create new csets, * which can't be more than the existing ones. Allocate 2x. */ ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) goto cancel_ref; kf_sops = root == &cgrp_dfl_root ? &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_SUPPORT_EXPORTOP | KERNFS_ROOT_SUPPORT_USER_XATTR, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); goto exit_root_id; } root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); root_cgrp->ancestors[0] = root_cgrp; ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; ret = cgroup_rstat_init(root_cgrp); if (ret) goto destroy_root; ret = rebind_subsystems(root, ss_mask); if (ret) goto exit_stats; ret = cgroup_bpf_inherit(root_cgrp); WARN_ON_ONCE(ret); trace_cgroup_setup_root(root); /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ list_add(&root->root_list, &cgroup_roots); cgroup_root_count++; /* * Link the root cgroup in this hierarchy into all the css_set * objects. */ spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) cgroup_update_populated(root_cgrp, true); } spin_unlock_irq(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); ret = 0; goto out; exit_stats: cgroup_rstat_exit(root_cgrp); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); cancel_ref: percpu_ref_exit(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; } int cgroup_do_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; ctx->kfc.root = ctx->root->kf_root; if (fc->fs_type == &cgroup2_fs_type) ctx->kfc.magic = CGROUP2_SUPER_MAGIC; else ctx->kfc.magic = CGROUP_SUPER_MAGIC; ret = kernfs_get_tree(fc); /* * In non-init cgroup namespace, instead of root cgroup's dentry, * we return the dentry corresponding to the cgroupns->root_cgrp. */ if (!ret && ctx->ns != &init_cgroup_ns) { struct dentry *nsdentry; struct super_block *sb = fc->root->d_sb; struct cgroup *cgrp; cgroup_lock(); spin_lock_irq(&css_set_lock); cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); spin_unlock_irq(&css_set_lock); cgroup_unlock(); nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(fc->root); if (IS_ERR(nsdentry)) { deactivate_locked_super(sb); ret = PTR_ERR(nsdentry); nsdentry = NULL; } fc->root = nsdentry; } if (!ctx->kfc.new_sb_created) cgroup_put(&ctx->root->cgrp); return ret; } /* * Destroy a cgroup filesystem context. */ static void cgroup_fs_context_free(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); kfree(ctx->name); kfree(ctx->release_agent); put_cgroup_ns(ctx->ns); kernfs_free_fs_context(fc); kfree(ctx); } static int cgroup_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; WRITE_ONCE(cgrp_dfl_visible, true); cgroup_get_live(&cgrp_dfl_root.cgrp); ctx->root = &cgrp_dfl_root; ret = cgroup_do_get_tree(fc); if (!ret) apply_cgroup_root_flags(ctx->flags); return ret; } static const struct fs_context_operations cgroup_fs_context_ops = { .free = cgroup_fs_context_free, .parse_param = cgroup2_parse_param, .get_tree = cgroup_get_tree, .reconfigure = cgroup_reconfigure, }; static const struct fs_context_operations cgroup1_fs_context_ops = { .free = cgroup_fs_context_free, .parse_param = cgroup1_parse_param, .get_tree = cgroup1_get_tree, .reconfigure = cgroup1_reconfigure, }; /* * Initialise the cgroup filesystem creation/reconfiguration context. Notably, * we select the namespace we're going to use. */ static int cgroup_init_fs_context(struct fs_context *fc) { struct cgroup_fs_context *ctx; ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); fc->fs_private = &ctx->kfc; if (fc->fs_type == &cgroup2_fs_type) fc->ops = &cgroup_fs_context_ops; else fc->ops = &cgroup1_fs_context_ops; put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; #ifdef CONFIG_CGROUP_FAVOR_DYNMODS ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; #endif return 0; } static void cgroup_kill_sb(struct super_block *sb) { struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); /* * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * * And don't kill the default root. */ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && !percpu_ref_is_dying(&root->cgrp.self.refcnt)) { cgroup_bpf_offline(&root->cgrp); percpu_ref_kill(&root->cgrp.self.refcnt); } cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } struct file_system_type cgroup_fs_type = { .name = "cgroup", .init_fs_context = cgroup_init_fs_context, .parameters = cgroup1_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { .name = "cgroup2", .init_fs_context = cgroup_init_fs_context, .parameters = cgroup2_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; #ifdef CONFIG_CPUSETS static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, }; /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ static int cpuset_init_fs_context(struct fs_context *fc) { char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER); struct cgroup_fs_context *ctx; int err; err = cgroup_init_fs_context(fc); if (err) { kfree(agent); return err; } fc->ops = &cpuset_fs_context_ops; ctx = cgroup_fc2context(fc); ctx->subsys_mask = 1 << cpuset_cgrp_id; ctx->flags |= CGRP_ROOT_NOPREFIX; ctx->release_agent = agent; get_filesystem(&cgroup_fs_type); put_filesystem(fc->fs_type); fc->fs_type = &cgroup_fs_type; return 0; } static struct file_system_type cpuset_fs_type = { .name = "cpuset", .init_fs_context = cpuset_init_fs_context, .fs_flags = FS_USERNS_MOUNT, }; #endif int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { int ret; cgroup_lock(); spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); spin_unlock_irq(&css_set_lock); cgroup_unlock(); return ret; } EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() * implementations (e.g. cpuset), also need to disable CPU hotplug. * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can * lead to deadlocks. * * Bringing up a CPU may involve creating and destroying tasks which requires * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while * write-locking threadgroup_rwsem, the locking order is reversed and we end up * waiting for an on-going CPU hotplug operation which in turn is waiting for * the threadgroup_rwsem to be released to create new tasks. For more details: * * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu * * Resolve the situation by always acquiring cpus_read_lock() before optionally * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. */ void cgroup_attach_lock(bool lock_threadgroup) { cpus_read_lock(); if (lock_threadgroup) percpu_down_write(&cgroup_threadgroup_rwsem); } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem */ void cgroup_attach_unlock(bool lock_threadgroup) { if (lock_threadgroup) percpu_up_write(&cgroup_threadgroup_rwsem); cpus_read_unlock(); } /** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task * @mgctx: target migration context * * Add @task, which is a migration target, to @mgctx->tset. This function * becomes noop if @task doesn't need to be migrated. @task's css_set * should have been added as a migration source and @task->cg_list will be * moved from the css_set's tasks list to mg_tasks one. */ static void cgroup_migrate_add_task(struct task_struct *task, struct cgroup_mgctx *mgctx) { struct css_set *cset; lockdep_assert_held(&css_set_lock); /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) return; /* cgroup_threadgroup_rwsem protects racing against forks */ WARN_ON_ONCE(list_empty(&task->cg_list)); cset = task_css_set(task); if (!cset->mg_src_cgrp) return; mgctx->tset.nr_tasks++; list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, &mgctx->tset.src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) list_add_tail(&cset->mg_dst_cset->mg_node, &mgctx->tset.dst_csets); } /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) { if (!task) task = list_first_entry(&cset->mg_tasks, struct task_struct, cg_list); else task = list_next_entry(task, cg_list); if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; /* * This function may be called both before and * after cgroup_migrate_execute(). The two cases * can be distinguished by looking at whether @cset * has its ->mg_dst_cset set. */ if (cset->mg_dst_cset) *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; else *dst_cssp = cset->subsys[tset->ssid]; return task; } cset = list_next_entry(cset, mg_node); task = NULL; } return NULL; } /** * cgroup_migrate_execute - migrate a taskset * @mgctx: migration context * * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and * guarantees that either all or none of the tasks in @mgctx are migrated. * @mgctx is consumed regardless of success. */ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) { struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; int ssid, failed_ssid, ret; /* check that we can legitimately attach to the cgroup */ if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->can_attach) { tset->ssid = ssid; ret = ss->can_attach(tset); if (ret) { failed_ssid = ssid; goto out_cancel_attach; } } } while_each_subsys_mask(); } /* * Now that we're guaranteed success, proceed to move all tasks to * the new cgroup. There are no failure cases after here, so this * is the commit point. */ spin_lock_irq(&css_set_lock); list_for_each_entry(cset, &tset->src_csets, mg_node) { list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { struct css_set *from_cset = task_css_set(task); struct css_set *to_cset = cset->mg_dst_cset; get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); from_cset->nr_tasks--; /* * If the source or destination cgroup is frozen, * the task might require to change its state. */ cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, to_cset->dfl_cgrp); put_css_set_locked(from_cset); } } spin_unlock_irq(&css_set_lock); /* * Migration is committed, all target tasks are now on dst_csets. * Nothing is sensitive to fork() after this point. Notify * controllers that migration is complete. */ tset->csets = &tset->dst_csets; if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->attach) { tset->ssid = ssid; ss->attach(tset); } } while_each_subsys_mask(); } ret = 0; goto out_release_tset; out_cancel_attach: if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ssid == failed_ssid) break; if (ss->cancel_attach) { tset->ssid = ssid; ss->cancel_attach(tset); } } while_each_subsys_mask(); } out_release_tset: spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); /* * Re-initialize the cgroup_taskset structure in case it is reused * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() * iteration. */ tset->nr_tasks = 0; tset->csets = &tset->src_csets; return ret; } /** * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination * @dst_cgrp: destination cgroup to test * * On the default hierarchy, except for the mixable, (possible) thread root * and threaded cgroups, subtree_control must be zero for migration * destination cgroups with tasks so that child cgroups don't compete * against tasks. */ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) { /* v1 doesn't have any restriction */ if (!cgroup_on_dfl(dst_cgrp)) return 0; /* verify @dst_cgrp can host resources */ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) return -EOPNOTSUPP; /* * If @dst_cgrp is already or can become a thread root or is * threaded, it doesn't matter. */ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) return 0; /* apply no-internal-process constraint */ if (dst_cgrp->subtree_control) return -EBUSY; return 0; } /** * cgroup_migrate_finish - cleanup after attach * @mgctx: migration context * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. */ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_src_preload_node); put_css_set_locked(cset); } list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, mg_dst_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_dst_preload_node); put_css_set_locked(cset); } spin_unlock_irq(&css_set_lock); } /** * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup * @mgctx: migration context * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin * @src_cset and add it to @mgctx->src_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem * even if the target is a process. Threads may be created and destroyed * but as long as cgroup_mutex is not dropped, no new css_set can be put * into play and the preloaded css_sets are guaranteed to cover all * migrations. */ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx) { struct cgroup *src_cgrp; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); /* * If ->dead, @src_set is associated with one or more dead cgroups * and doesn't contain any migratable tasks. Ignore it early so * that the rest of migration path doesn't get confused by it. */ if (src_cset->dead) return; if (!list_empty(&src_cset->mg_src_preload_node)) return; src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_node)); src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration * @mgctx: migration context * * Tasks are about to be moved and all the source css_sets have been * preloaded to @mgctx->preloaded_src_csets. This function looks up and * pins all destination css_sets, links each to its source, and append them * to @mgctx->preloaded_dst_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @mgctx. */ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) { struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) return -ENOMEM; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); /* * If src cset equals dst, it's noop. Drop the src. * cgroup_migrate() will skip the cset too. Note that we * can't handle src == dst as some nodes are used by both. */ if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; list_del_init(&src_cset->mg_src_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; } src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_dst_preload_node)) list_add_tail(&dst_cset->mg_dst_preload_node, &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); for_each_subsys(ss, ssid) if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) mgctx->ss_mask |= 1 << ssid; } return 0; } /** * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also * responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). * * As long as a controller's ->can_attach() doesn't fail, this function is * guaranteed to succeed. This means that, excluding ->can_attach() * failure, when migrating multiple targets, the success or failure can be * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct cgroup_mgctx *mgctx) { struct task_struct *task; /* * The following thread iteration should be inside an RCU critical * section to prevent tasks from being freed while taking the snapshot. * spin_lock_irq() implies RCU critical section here. */ spin_lock_irq(&css_set_lock); task = leader; do { cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); spin_unlock_irq(&css_set_lock); return cgroup_migrate_execute(mgctx); } /** * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @dst_cgrp: the cgroup to attach to * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) { DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; int ret = 0; /* look up all src csets */ spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) ret = cgroup_migrate(leader, threadgroup, &mgctx); cgroup_migrate_finish(&mgctx); if (!ret) TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup); return ret; } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, bool *threadgroup_locked) { struct task_struct *tsk; pid_t pid; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); /* * If we migrate a single thread, we don't care about threadgroup * stability. If the thread is `current`, it won't exit(2) under our * hands or change PID through exec(2). We exclude * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write * callers by cgroup_mutex. * Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); *threadgroup_locked = pid || threadgroup; cgroup_attach_lock(*threadgroup_locked); rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { tsk = ERR_PTR(-ESRCH); goto out_unlock_threadgroup; } } else { tsk = current; } if (threadgroup) tsk = tsk->group_leader; /* * kthreads may acquire PF_NO_SETAFFINITY during initialization. * If userland migrates such a kthread to a non-root cgroup, it can * become trapped in a cpuset, or RT kthread may be born in a * cgroup with no rt_runtime allocated. Just say no. */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { tsk = ERR_PTR(-EINVAL); goto out_unlock_threadgroup; } get_task_struct(tsk); goto out_unlock_rcu; out_unlock_threadgroup: cgroup_attach_unlock(*threadgroup_locked); *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) { struct cgroup_subsys *ss; int ssid; /* release reference from cgroup_procs_write_start() */ put_task_struct(task); cgroup_attach_unlock(threadgroup_locked); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; bool printed = false; int ssid; do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); seq_puts(seq, ss->name); printed = true; } while_each_subsys_mask(); if (printed) seq_putc(seq, '\n'); } /* show controllers which are enabled from the parent */ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgroup_control(cgrp)); return 0; } /* show controllers which are enabled for a given cgroup's children */ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgrp->subtree_control); return 0; } /** * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * * @cgrp's control masks have changed and its subtree's css associations * need to be updated accordingly. This function looks up all css_sets * which are attached to the subtree, creates the matching updated css_sets * and migrates the tasks to the new ones. */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { DEFINE_CGROUP_MGCTX(mgctx); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; bool has_tasks; int ret; lockdep_assert_held(&cgroup_mutex); /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; /* * As cgroup_update_dfl_csses() is only called by * cgroup_apply_control(). The csses associated with the * given cgrp will not be affected by changes made to * its subtree_control file. We can skip them. */ if (dsct == cgrp) continue; list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); /* * We need to write-lock threadgroup_rwsem while migrating tasks. * However, if there are no source csets for @cgrp, changing its * controllers isn't gonna produce any task migrations and the * write-locking can be skipped safely. */ has_tasks = !list_empty(&mgctx.preloaded_src_csets); cgroup_attach_lock(has_tasks); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) cgroup_migrate_add_task(task, &mgctx); } spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); cgroup_attach_unlock(has_tasks); return ret; } /** * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses * @cgrp: root of the target subtree * * Because css offlining is asynchronous, userland may try to re-enable a * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ void cgroup_lock_and_drain_offline(struct cgroup *cgrp) __acquires(&cgroup_mutex) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; restart: cgroup_lock(); cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); DEFINE_WAIT(wait); if (!css || !percpu_ref_is_dying(&css->refcnt)) continue; cgroup_get_live(dsct); prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); cgroup_unlock(); schedule(); finish_wait(&dsct->offline_waitq, &wait); cgroup_put(dsct); goto restart; } } } /** * cgroup_save_control - save control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. */ static void cgroup_save_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->old_subtree_control = dsct->subtree_control; dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; dsct->old_dom_cgrp = dsct->dom_cgrp; } } /** * cgroup_propagate_control - refresh control masks of a subtree * @cgrp: root of the target subtree * * For @cgrp and its subtree, ensure ->subtree_ss_mask matches * ->subtree_control and propagate controller availability through the * subtree so that descendants don't have unavailable controllers enabled. */ static void cgroup_propagate_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->subtree_control &= cgroup_control(dsct); dsct->subtree_ss_mask = cgroup_calc_subtree_ss_mask(dsct->subtree_control, cgroup_ss_mask(dsct)); } } /** * cgroup_restore_control - restore control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. */ static void cgroup_restore_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { dsct->subtree_control = dsct->old_subtree_control; dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; dsct->dom_cgrp = dsct->old_dom_cgrp; } } static bool css_visible(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; if (cgroup_control(cgrp) & (1 << ss->id)) return true; if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) return false; return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; } /** * cgroup_apply_control_enable - enable or show csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and create new csses or make the existing ones * visible. A css is created invisible if it's being implicitly enabled * through dependency. An invisible css is made visible when the userland * explicitly enables it. * * Returns 0 on success, -errno on failure. On failure, csses which have * been processed already aren't cleaned up. The caller is responsible for * cleaning up with cgroup_apply_control_disable(). */ static int cgroup_apply_control_enable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid, ret; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue; if (!css) { css = css_create(dsct, ss); if (IS_ERR(css)) return PTR_ERR(css); } WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css_visible(css)) { ret = css_populate_dir(css); if (ret) return ret; } } } return 0; } /** * cgroup_apply_control_disable - kill or hide csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and kill and hide csses so that they match * cgroup_ss_mask() and cgroup_visible_mask(). * * A css is hidden when the userland requests it to be disabled while other * subsystems are still depending on it. The css must not actively control * resources and be in the vanilla state if it's made visible again later. * Controllers which may be depended upon should provide ->css_reset() for * this purpose. */ static void cgroup_apply_control_disable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!css) continue; WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) ss->css_reset(css); } } } } /** * cgroup_apply_control - apply control mask updates to the subtree * @cgrp: root of the target subtree * * subsystems can be enabled and disabled in a subtree using the following * steps. * * 1. Call cgroup_save_control() to stash the current state. * 2. Update ->subtree_control masks in the subtree as desired. * 3. Call cgroup_apply_control() to apply the changes. * 4. Optionally perform other related operations. * 5. Call cgroup_finalize_control() to finish up. * * This function implements step 3 and propagates the mask changes * throughout @cgrp's subtree, updates csses accordingly and perform * process migrations. */ static int cgroup_apply_control(struct cgroup *cgrp) { int ret; cgroup_propagate_control(cgrp); ret = cgroup_apply_control_enable(cgrp); if (ret) return ret; /* * At this point, cgroup_e_css_by_mask() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ return cgroup_update_dfl_csses(cgrp); } /** * cgroup_finalize_control - finalize control mask update * @cgrp: root of the target subtree * @ret: the result of the update * * Finalize control mask update. See cgroup_apply_control() for more info. */ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) { if (ret) { cgroup_restore_control(cgrp); cgroup_propagate_control(cgrp); } cgroup_apply_control_disable(cgrp); } static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) { u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; /* if nothing is getting enabled, nothing to worry about */ if (!enable) return 0; /* can @cgrp host any resources? */ if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) return -EOPNOTSUPP; /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return 0; if (domain_enable) { /* can't enable domain controllers inside a thread subtree */ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return -EOPNOTSUPP; } else { /* * Threaded controllers can handle internal competitions * and are always allowed inside a (prospective) thread * subtree. */ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return 0; } /* * Controllers can't be enabled for a cgroup with tasks to avoid * child cgroups competing against tasks. */ if (cgroup_has_tasks(cgrp)) return -EBUSY; return 0; } /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { u16 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; int ssid, ret; /* * Parse input - space separated list of subsystem names prefixed * with either + or -. */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue; do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; if (*tok == '+') { enable |= 1 << ssid; disable &= ~(1 << ssid); } else if (*tok == '-') { disable |= 1 << ssid; enable &= ~(1 << ssid); } else { return -EINVAL; } break; } while_each_subsys_mask(); if (ssid == CGROUP_SUBSYS_COUNT) return -EINVAL; } cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENODEV; for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { if (cgrp->subtree_control & (1 << ssid)) { enable &= ~(1 << ssid); continue; } if (!(cgroup_control(cgrp) & (1 << ssid))) { ret = -ENOENT; goto out_unlock; } } else if (disable & (1 << ssid)) { if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); continue; } /* a child has it enabled? */ cgroup_for_each_live_child(child, cgrp) { if (child->subtree_control & (1 << ssid)) { ret = -EBUSY; goto out_unlock; } } } } if (!enable && !disable) { ret = 0; goto out_unlock; } ret = cgroup_vet_subtree_control_enable(cgrp, enable); if (ret) goto out_unlock; /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); cgrp->subtree_control |= enable; cgrp->subtree_control &= ~disable; ret = cgroup_apply_control(cgrp); cgroup_finalize_control(cgrp, ret); if (ret) goto out_unlock; kernfs_activate(cgrp->kn); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } /** * cgroup_enable_threaded - make @cgrp threaded * @cgrp: the target cgroup * * Called when "threaded" is written to the cgroup.type interface file and * tries to make @cgrp threaded and join the parent's resource domain. * This function is never called on the root cgroup as cgroup.type doesn't * exist on it. */ static int cgroup_enable_threaded(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup *dom_cgrp = parent->dom_cgrp; struct cgroup *dsct; struct cgroup_subsys_state *d_css; int ret; lockdep_assert_held(&cgroup_mutex); /* noop if already threaded */ if (cgroup_is_threaded(cgrp)) return 0; /* * If @cgroup is populated or has domain controllers enabled, it * can't be switched. While the below cgroup_can_be_thread_root() * test can catch the same conditions, that's only when @parent is * not mixable, so let's check it explicitly. */ if (cgroup_is_populated(cgrp) || cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return -EOPNOTSUPP; /* we're joining the parent's domain, ensure its validity */ if (!cgroup_is_valid_domain(dom_cgrp) || !cgroup_can_be_thread_root(dom_cgrp)) return -EOPNOTSUPP; /* * The following shouldn't cause actual migrations and should * always succeed. */ cgroup_save_control(cgrp); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) if (dsct == cgrp || cgroup_is_threaded(dsct)) dsct->dom_cgrp = dom_cgrp; ret = cgroup_apply_control(cgrp); if (!ret) parent->nr_threaded_children++; cgroup_finalize_control(cgrp, ret); return ret; } static int cgroup_type_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; if (cgroup_is_threaded(cgrp)) seq_puts(seq, "threaded\n"); else if (!cgroup_is_valid_domain(cgrp)) seq_puts(seq, "domain invalid\n"); else if (cgroup_is_thread_root(cgrp)) seq_puts(seq, "domain threaded\n"); else seq_puts(seq, "domain\n"); return 0; } static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int ret; /* only switching to threaded mode is supported */ if (strcmp(strstrip(buf), "threaded")) return -EINVAL; /* drain dying csses before we re-apply (threaded) subtree control */ cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENOENT; /* threaded can only be enabled */ ret = cgroup_enable_threaded(cgrp); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static int cgroup_max_descendants_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int descendants = READ_ONCE(cgrp->max_descendants); if (descendants == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", descendants); return 0; } static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int descendants; ssize_t ret; buf = strstrip(buf); if (!strcmp(buf, "max")) { descendants = INT_MAX; } else { ret = kstrtoint(buf, 0, &descendants); if (ret) return ret; } if (descendants < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_descendants = descendants; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_max_depth_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int depth = READ_ONCE(cgrp->max_depth); if (depth == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", depth); return 0; } static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; ssize_t ret; int depth; buf = strstrip(buf); if (!strcmp(buf, "max")) { depth = INT_MAX; } else { ret = kstrtoint(buf, 0, &depth); if (ret) return ret; } if (depth < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_depth = depth; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_events_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); return 0; } static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; seq_printf(seq, "nr_descendants %d\n", cgroup->nr_descendants); seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); return 0; } #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get @cgrp's css associated with @ss. If the css doesn't exist * or is offline, %NULL is returned. */ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; rcu_read_lock(); css = cgroup_css(cgrp, ss); if (css && !css_tryget_online(css)) css = NULL; rcu_read_unlock(); return css; } static int cgroup_extra_stat_show(struct seq_file *seq, int ssid) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; if (!ss->css_extra_stat_show) return 0; css = cgroup_tryget_css(cgrp, ss); if (!css) return 0; ret = ss->css_extra_stat_show(seq, css); css_put(css); return ret; } static int cgroup_local_stat_show(struct seq_file *seq, struct cgroup *cgrp, int ssid) { struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; if (!ss->css_local_stat_show) return 0; css = cgroup_tryget_css(cgrp, ss); if (!css) return 0; ret = ss->css_local_stat_show(seq, css); css_put(css); return ret; } #endif static int cpu_stat_show(struct seq_file *seq, void *v) { int ret = 0; cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED ret = cgroup_extra_stat_show(seq, cpu_cgrp_id); #endif return ret; } static int cpu_local_stat_show(struct seq_file *seq, void *v) { struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; #ifdef CONFIG_CGROUP_SCHED ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id); #endif return ret; } #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_CPU); } static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; cgroup_get(cgrp); cgroup_kn_unlock(of->kn); /* Allow only one trigger per file descriptor */ if (ctx->psi.trigger) { cgroup_put(cgrp); return -EBUSY; } psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); } smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; } static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_IO); } static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_MEM); } static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_CPU); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IRQ); } static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_IRQ); } #endif static int cgroup_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); seq_printf(seq, "%d\n", psi->enabled); return 0; } static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { ssize_t ret; int enable; struct cgroup *cgrp; struct psi_group *psi; ret = kstrtoint(strstrip(buf), 0, &enable); if (ret) return ret; if (enable < 0 || enable > 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; psi = cgroup_psi(cgrp); if (psi->enabled != enable) { int i; /* show or hide {cpu,memory,io,irq}.pressure files */ for (i = 0; i < NR_PSI_RESOURCES; i++) cgroup_file_show(&cgrp->psi_files[i], enable); psi->enabled = enable; if (enable) psi_cgroup_restart(psi); } cgroup_kn_unlock(of->kn); return nbytes; } static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { struct cgroup_file_ctx *ctx = of->priv; return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } static int cgroup_pressure_open(struct kernfs_open_file *of) { if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) return -EPERM; return 0; } static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) { if (static_branch_likely(&psi_disabled)) return false; return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } #else /* CONFIG_PSI */ bool cgroup_psi_enabled(void) { return false; } #endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; seq_printf(seq, "%d\n", cgrp->freezer.freeze); return 0; } static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; ssize_t ret; int freeze; ret = kstrtoint(strstrip(buf), 0, &freeze); if (ret) return ret; if (freeze < 0 || freeze > 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgroup_freeze(cgrp, freeze); cgroup_kn_unlock(of->kn); return nbytes; } static void __cgroup_kill(struct cgroup *cgrp) { struct css_task_iter it; struct task_struct *task; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); set_bit(CGRP_KILL, &cgrp->flags); spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); while ((task = css_task_iter_next(&it))) { /* Ignore kernel threads here. */ if (task->flags & PF_KTHREAD) continue; /* Skip tasks that are already dying. */ if (__fatal_signal_pending(task)) continue; send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); spin_lock_irq(&css_set_lock); clear_bit(CGRP_KILL, &cgrp->flags); spin_unlock_irq(&css_set_lock); } static void cgroup_kill(struct cgroup *cgrp) { struct cgroup_subsys_state *css; struct cgroup *dsct; lockdep_assert_held(&cgroup_mutex); cgroup_for_each_live_descendant_pre(dsct, css, cgrp) __cgroup_kill(dsct); } static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { ssize_t ret = 0; int kill; struct cgroup *cgrp; ret = kstrtoint(strstrip(buf), 0, &kill); if (ret) return ret; if (kill != 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; /* * Killing is a process directed operation, i.e. the whole thread-group * is taken down so act like we do for cgroup.procs and only make this * writable in non-threaded cgroups. */ if (cgroup_is_threaded(cgrp)) ret = -EOPNOTSUPP; else cgroup_kill(cgrp); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); struct cgroup_file_ctx *ctx; int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); of->priv = ctx; if (!cft->open) return 0; ret = cft->open(of); if (ret) { put_cgroup_ns(ctx->ns); kfree(ctx); } return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; if (!nbytes) return 0; /* * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace * except for the files explicitly marked delegatable - * cgroup.procs and cgroup.subtree_control. */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) return cft->write(of, buf, nbytes, off); /* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and * doesn't need to be pinned. The RCU locking is not necessary * either. It's just for the convenience of using cgroup_css(). */ rcu_read_lock(); css = cgroup_css(cgrp, cft->ss); rcu_read_unlock(); if (cft->write_u64) { unsigned long long v; ret = kstrtoull(buf, 0, &v); if (!ret) ret = cft->write_u64(css, cft, v); } else if (cft->write_s64) { long long v; ret = kstrtoll(buf, 0, &v); if (!ret) ret = cft->write_s64(css, cft, v); } else { ret = -EINVAL; } return ret ?: nbytes; } static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) { struct cftype *cft = of_cft(of); if (cft->poll) return cft->poll(of, pt); return kernfs_generic_poll(of, pt); } static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); } static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { return seq_cft(seq)->seq_next(seq, v, ppos); } static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { if (seq_cft(seq)->seq_stop) seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cftype *cft = seq_cft(m); struct cgroup_subsys_state *css = seq_css(m); if (cft->seq_show) return cft->seq_show(m, arg); if (cft->read_u64) seq_printf(m, "%llu\n", cft->read_u64(css, cft)); else if (cft->read_s64) seq_printf(m, "%lld\n", cft->read_s64(css, cft)); else return -EINVAL; return 0; } static struct kernfs_ops cgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; static struct kernfs_ops cgroup_kf_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, .seq_show = cgroup_seqfile_show, }; /* set uid and gid of cgroup dirs and files to that of the creator */ static int cgroup_kn_set_ugid(struct kernfs_node *kn) { struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = current_fsuid(), .ia_gid = current_fsgid(), }; if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) return 0; return kernfs_setattr(kn, &iattr); } static void cgroup_file_notify_timer(struct timer_list *timer) { cgroup_file_notify(container_of(timer, struct cgroup_file, notify_timer)); } static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; int ret; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); ret = cgroup_kn_set_ugid(kn); if (ret) { kernfs_remove(kn); return ret; } if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; spin_unlock_irq(&cgroup_file_kn_lock); } return 0; } /** * cgroup_addrm_files - add or remove files to a cgroup directory * @css: the target css * @cgrp: the target cgroup (usually css->cgroup) * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. * For removals, this function never fails. */ static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add) { struct cftype *cft, *cft_end = NULL; int ret = 0; lockdep_assert_held(&cgroup_mutex); restart: for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; if (is_add) { ret = cgroup_add_file(css, cgrp, cft); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); cft_end = cft; is_add = false; goto restart; } } else { cgroup_rm_file(cgrp, cft); } } return ret; } static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; int ret = 0; lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; if (!(css->flags & CSS_VISIBLE)) continue; ret = cgroup_addrm_files(css, cgrp, cfts, is_add); if (ret) break; } if (is_add && !ret) kernfs_activate(root->kn); return ret; } static void cgroup_exit_cftypes(struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft->name[0] != '\0'; cft++) { /* free copy for custom atomic_write_len, see init_cftypes() */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) kfree(cft->kf_ops); cft->kf_ops = NULL; cft->ss = NULL; /* revert flags set by cgroup core while adding @cfts */ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL | __CFTYPE_ADDED); } } static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; int ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; WARN_ON(cft->ss || cft->kf_ops); if (cft->flags & __CFTYPE_ADDED) { ret = -EBUSY; break; } if (cft->seq_start) kf_ops = &cgroup_kf_ops; else kf_ops = &cgroup_kf_single_ops; /* * Ugh... if @cft wants a custom max_write_len, we need to * make a copy of kf_ops to set its atomic_write_len. */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); if (!kf_ops) { ret = -ENOMEM; break; } kf_ops->atomic_write_len = cft->max_write_len; } cft->kf_ops = kf_ops; cft->ss = ss; cft->flags |= __CFTYPE_ADDED; } if (ret) cgroup_exit_cftypes(cfts); return ret; } static void cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); } /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem * @cfts: zero-length name terminated array of cftypes * * Unregister @cfts. Files described by @cfts are removed from all * existing cgroups and all future cgroups won't have them either. This * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not * registered. */ int cgroup_rm_cftypes(struct cftype *cfts) { if (!cfts || cfts[0].name[0] == '\0') return 0; if (!(cfts[0].flags & __CFTYPE_ADDED)) return -ENOENT; cgroup_lock(); cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); return 0; } /** * cgroup_add_cftypes - add an array of cftypes to a subsystem * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Register @cfts to @ss. Files described by @cfts are created for all * existing cgroups to which @ss is attached and all future cgroups will * have them too. This function can be called anytime whether @ss is * attached or not. * * Returns 0 on successful registration, -errno on failure. Note that this * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; if (!cgroup_ssid_enabled(ss->id)) return 0; if (!cfts || cfts[0].name[0] == '\0') return 0; ret = cgroup_init_cftypes(ss, cfts); if (ret) return ret; cgroup_lock(); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); return ret; } /** * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the default hierarchy. */ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_ONLY_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the legacy hierarchies. */ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_NOT_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_file_notify - generate a file modified event for a cgroup_file * @cfile: target cgroup_file * * @cfile must have been obtained by setting cftype->file_offset. */ void cgroup_file_notify(struct cgroup_file *cfile) { unsigned long flags; spin_lock_irqsave(&cgroup_file_kn_lock, flags); if (cfile->kn) { unsigned long last = cfile->notified_at; unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; if (time_in_range(jiffies, last, next)) { timer_reduce(&cfile->notify_timer, next); } else { kernfs_notify(cfile->kn); cfile->notified_at = jiffies; } } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } /** * cgroup_file_show - show or hide a hidden cgroup file * @cfile: target cgroup_file obtained by setting cftype->file_offset * @show: whether to show or hide */ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; spin_lock_irq(&cgroup_file_kn_lock); kn = cfile->kn; kernfs_get(kn); spin_unlock_irq(&cgroup_file_kn_lock); if (kn) kernfs_show(kn, show); kernfs_put(kn); } /** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to ini |