Total coverage: 146890 (8%)of 1852487
5062 438 119 164 155 14 9023 9014 9009 5054 5265 69 7380 382 1695 1697 1693 5 1695 1695 1696 1688 5740 5753 5248 546 5753 1442 1444 31 1310 1412 1438 1408 1913 1879 155 240 1909 500 628 1617 1910 1915 58 58 58 1 52 57 58 57 2168 2173 2171 2169 2169 541 542 544 544 1932 5 192 1 285 32 46 16 1964 650 9 1690 47 1975 8 1979 1974 47 367 68 2590 8 2711 1052 1025 52 52 25 24 30 864 830 34 850 30 61 62 34 15 2 6 6 4 4 15 504 398 2171 1 2173 2172 2166 49 2171 2127 91 2170 2171 2173 2173 2174 2171 2229 2232 2232 2229 2230 2186 121 2229 2184 121 2230 38 37 2227 2224 2229 1 1 1 2230 2230 2231 2227 115 2229 2234 2234 115 2225 12 2234 2233 2230 9 2227 137 12 185 7 2 2229 2231 2190 361 2231 1 2229 2230 2227 2229 1479 1228 2229 2232 29 2234 2228 21 2016 45 2183 2226 2229 4 2226 69 2227 2228 13 2230 2 2232 1 2186 121 2 2228 5 2234 2229 2231 16 2229 8 10 2231 16 5 5 5 7 9 6 9 9 10 8 5 3 7 5 9 10 5 6 9 12 12 13 2212 163 497 1899 2 1 43 18 2233 1979 2228 2231 2232 2232 13 2229 2231 55 12 4 2074 315 314 3 5 2230 64 1 2 5 62 2 65 66 65 10 10 1 3 3 1 3 68 76 76 5 10 66 66 2 6 6 8 9 1 58 5 66 66 66 66 64 4 68 19 19 1 12 57 5 14 14 14 1689 4 1705 1707 6 1698 8 9 1873 3 1 1536 1535 1532 1532 4 1529 2 2 1529 2 1530 2 1524 2 1454 57 1 6 50 57 1 59 3 2 53 1 4 42 3 5 1 2 35 3 3 1 26 3 4 20 1 3 14 2 2 11 1 1 8 1 1 1 1 2 1 3 269 265 2 27 27 27 40 79 56 153 153 153 18 8 2 1 1 2 4 3 5 5 517 27 126 374 496 4 501 498 2 463 36 36 18 19 447 10 30 368 116 468 3 8 441 34 471 5 474 379 380 380 45 74 445 2 4 434 445 2 3 444 2 6 444 3 3 448 1 448 1 1 439 11 437 14 371 4 7 3 66 2 372 3 368 1 2 322 39 37 9 361 18 301 1 3 3 2 28 7 2 5 137 111 289 307 120 501 242 21 88 1 5 82 4 75 75 75 48 2 25 5 72 75 2 1 1 1 2 155 174 2 2 4 165 1 146 47 13 9 157 9 45 2 162 102 706 526 527 50 702 1051 27 874 164 8 897 161 1 2 1049 9 1004 3 33 986 49 1028 10 1032 3 1036 4 1030 7 1033 4 1034 5 1032 5 1034 3 1033 1 1038 1 14 5 270 20 193 322 14 30 98 7 114 1053 1054 37 1015 15 1035 3 479 699 699 536 178 587 146 144 2 44 44 1 8 13 21 357 12 211 486 1102 9 116 4 14 1053 1813 5 5 1802 1440 378 1404 382 30 111 1295 70 1336 1183 66 4 5 6 1627 1621 3 1623 1624 1615 1625 1404 1 18 3 3 4 5 5 4 4 32 32 22 2 15 1 11 4 8 2 5 5 8 15 2 7 4 10 34 1 9 2 12 1 9 2 2 14 2 11 4 5 19 16 71 71 8 5 40 5 5 5 5 66 66 41 66 65 66 42 21 66 2166 2069 205 2163 408 680 2048 73 2117 1997 1994 4 4 17 8 1 17 1 6 6 9 3 1 1 2 2 4 13 38 2 5 3 40 1 4 2 2 2 1 35 29 5 3 3 20 9 25 6 4 1 2 1 38 13 25 2 1 2 21 2 2 10 1 25 2 6 6 2 2 4 3 20 3 12 2 11 12 11 12 12 1 3 1 4 2 5 5 1 1 1 1 1 8 1 2 2 4 29 12 17 13 13 1 3 15 15 12 9 3 2 9 12 11 8 12 1 4 4 1 6 13 13 1 3 8 3 1 25 25 2 1 1 1 1 2 1 2 1 3 2 2 2 1 1 2 3 5 13 5 10 10 6 5 3 8 22 22 14 13 13 13 2 12 13 2 12 13 9 9 61 1 10 7 53 47 1 23 49 11 20 13 30 6 6 2 11 8 4 22 33 1 6 5 24 2 7 5 12 14 9 9 2 2 1 7 7 21 12 12 12 4 12 5 1 2 2 14 1 2 2 4 12 14 2 12 3 2 4 4 2 24 24 5 15 8 16 23 13 1 3 14 24 11 5 1 2 6 24 2 14 24 1 12 12 12 2 5 12 12 10 10 5 12 5 3 4 7 5 1 1 6 6 8 4 4 4 4 4 1 27 5 21 1 3 42 7 7 3 3 19 19 1 1 1 2 10 21 20 1 3 12 12 12 2 2 2 19 19 1 3 2 1 5 1 2 1 2 1 1 1 1 1 6 2 4 3 2 3 3 8 1 1 1 1 1 2 1 13 1 8 1 1 1 1 1 22 1 1 1 1 5 1 5 6 1 1 15 1 21 1 7 2 1 1 1 9 10 1 1 1 1 5 1 24 9 15 11 1 1 1 1 3 1 3 458 36 438 20 364 23 70 42 101 5743 7 2 5740 5746 5282 483 29 411 71 483 378 101 5286 294 5286 1 3406 2061 3 3 5774 7 6 3 2186 2180 703 114 114 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Routing netlink socket interface: protocol independent part. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Vitaly E. Lavrov RTA_OK arithmetic was wrong. */ #include <linux/bitops.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/security.h> #include <linux/mutex.h> #include <linux/if_addr.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/pci.h> #include <linux/etherdevice.h> #include <linux/bpf.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> #include <net/route.h> #include <net/udp.h> #include <net/tcp.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <net/fib_rules.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/netdev_lock.h> #include <net/devlink.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/addrconf.h> #endif #include <linux/dpll.h> #include "dev.h" #define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 44 struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; struct module *owner; unsigned int flags; struct rcu_head rcu; }; static DEFINE_MUTEX(rtnl_mutex); void rtnl_lock(void) { mutex_lock(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_lock); int rtnl_lock_interruptible(void) { return mutex_lock_interruptible(&rtnl_mutex); } int rtnl_lock_killable(void) { return mutex_lock_killable(&rtnl_mutex); } static struct sk_buff *defer_kfree_skb_list; void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) { if (head && tail) { tail->next = defer_kfree_skb_list; defer_kfree_skb_list = head; } } EXPORT_SYMBOL(rtnl_kfree_skbs); void __rtnl_unlock(void) { struct sk_buff *head = defer_kfree_skb_list; defer_kfree_skb_list = NULL; /* Ensure that we didn't actually add any TODO item when __rtnl_unlock() * is used. In some places, e.g. in cfg80211, we have code that will do * something like * rtnl_lock() * wiphy_lock() * ... * rtnl_unlock() * * and because netdev_run_todo() acquires the RTNL for items on the list * we could cause a situation such as this: * Thread 1 Thread 2 * rtnl_lock() * unregister_netdevice() * __rtnl_unlock() * rtnl_lock() * wiphy_lock() * rtnl_unlock() * netdev_run_todo() * __rtnl_unlock() * * // list not empty now * // because of thread 2 * rtnl_lock() * while (!list_empty(...)) * rtnl_lock() * wiphy_lock() * **** DEADLOCK **** * * However, usage of __rtnl_unlock() is rare, and so we can ensure that * it's not used in cases where something is added to do the list. */ WARN_ON(!list_empty(&net_todo_list)); mutex_unlock(&rtnl_mutex); while (head) { struct sk_buff *next = head->next; kfree_skb(head); cond_resched(); head = next; } } void rtnl_unlock(void) { /* This fellow will unlock it for us. */ netdev_run_todo(); } EXPORT_SYMBOL(rtnl_unlock); int rtnl_trylock(void) { return mutex_trylock(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_trylock); int rtnl_is_locked(void) { return mutex_is_locked(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_is_locked); bool refcount_dec_and_rtnl_lock(refcount_t *r) { return refcount_dec_and_mutex_lock(r, &rtnl_mutex); } EXPORT_SYMBOL(refcount_dec_and_rtnl_lock); #ifdef CONFIG_PROVE_LOCKING bool lockdep_rtnl_is_held(void) { return lockdep_is_held(&rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ #ifdef CONFIG_DEBUG_NET_SMALL_RTNL void __rtnl_net_lock(struct net *net) { ASSERT_RTNL(); mutex_lock(&net->rtnl_mutex); } EXPORT_SYMBOL(__rtnl_net_lock); void __rtnl_net_unlock(struct net *net) { ASSERT_RTNL(); mutex_unlock(&net->rtnl_mutex); } EXPORT_SYMBOL(__rtnl_net_unlock); void rtnl_net_lock(struct net *net) { rtnl_lock(); __rtnl_net_lock(net); } EXPORT_SYMBOL(rtnl_net_lock); void rtnl_net_unlock(struct net *net) { __rtnl_net_unlock(net); rtnl_unlock(); } EXPORT_SYMBOL(rtnl_net_unlock); int rtnl_net_trylock(struct net *net) { int ret = rtnl_trylock(); if (ret) __rtnl_net_lock(net); return ret; } EXPORT_SYMBOL(rtnl_net_trylock); int rtnl_net_lock_killable(struct net *net) { int ret = rtnl_lock_killable(); if (!ret) __rtnl_net_lock(net); return ret; } static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) { if (net_eq(net_a, net_b)) return 0; /* always init_net first */ if (net_eq(net_a, &init_net)) return -1; if (net_eq(net_b, &init_net)) return 1; /* otherwise lock in ascending order */ return net_a < net_b ? -1 : 1; } int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { const struct net *net_a, *net_b; net_a = container_of(a, struct net, rtnl_mutex.dep_map); net_b = container_of(b, struct net, rtnl_mutex.dep_map); return rtnl_net_cmp_locks(net_a, net_b); } bool rtnl_net_is_locked(struct net *net) { return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex); } EXPORT_SYMBOL(rtnl_net_is_locked); bool lockdep_rtnl_net_is_held(struct net *net) { return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_net_is_held); #else static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) { /* No need to swap */ return -1; } #endif struct rtnl_nets { /* ->newlink() needs to freeze 3 netns at most; * 2 for the new device, 1 for its peer. */ struct net *net[3]; unsigned char len; }; static void rtnl_nets_init(struct rtnl_nets *rtnl_nets) { memset(rtnl_nets, 0, sizeof(*rtnl_nets)); } static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets) { int i; for (i = 0; i < rtnl_nets->len; i++) { put_net(rtnl_nets->net[i]); rtnl_nets->net[i] = NULL; } rtnl_nets->len = 0; } /** * rtnl_nets_add - Add netns to be locked before ->newlink(). * * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net(). * @net: netns pointer with an extra refcnt held. * * The extra refcnt is released in rtnl_nets_destroy(). */ static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net) { int i; DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net)); for (i = 0; i < rtnl_nets->len; i++) { switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) { case 0: put_net(net); return; case 1: swap(rtnl_nets->net[i], net); } } rtnl_nets->net[i] = net; rtnl_nets->len++; } static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets) { int i; rtnl_lock(); for (i = 0; i < rtnl_nets->len; i++) __rtnl_net_lock(rtnl_nets->net[i]); } static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets) { int i; for (i = 0; i < rtnl_nets->len; i++) __rtnl_net_unlock(rtnl_nets->net[i]); rtnl_unlock(); } static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { int msgindex = msgtype - RTM_BASE; /* * msgindex < 0 implies someone tried to register a netlink * control code. msgindex >= RTM_NR_MSGTYPES may indicate that * the message type has not been added to linux/rtnetlink.h */ BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); return msgindex; } static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) { struct rtnl_link __rcu **tab; if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) protocol = PF_UNSPEC; tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); if (!tab) tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); return rcu_dereference_rtnl(tab[msgtype]); } static int rtnl_register_internal(struct module *owner, int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { struct rtnl_link *link, *old; struct rtnl_link __rcu **tab; int msgindex; int ret = -ENOBUFS; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (tab == NULL) { tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); if (!tab) goto unlock; /* ensures we see the 0 stores */ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } old = rtnl_dereference(tab[msgindex]); if (old) { link = kmemdup(old, sizeof(*old), GFP_KERNEL); if (!link) goto unlock; } else { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) goto unlock; } WARN_ON(link->owner && link->owner != owner); link->owner = owner; WARN_ON(doit && link->doit && link->doit != doit); if (doit) link->doit = doit; WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); if (dumpit) link->dumpit = dumpit; WARN_ON(rtnl_msgtype_kind(msgtype) != RTNL_KIND_DEL && (flags & RTNL_FLAG_BULK_DEL_SUPPORTED)); link->flags |= flags; /* publish protocol:msgtype */ rcu_assign_pointer(tab[msgindex], link); ret = 0; if (old) kfree_rcu(old, rcu); unlock: rtnl_unlock(); return ret; } /** * rtnl_unregister - Unregister a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * * Returns 0 on success or a negative error code. */ static int rtnl_unregister(int protocol, int msgtype) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (!tab) { rtnl_unlock(); return -ENOENT; } link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); rtnl_unlock(); kfree_rcu(link, rcu); return 0; } /** * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol * @protocol : Protocol family or PF_UNSPEC * * Identical to calling rtnl_unregster() for all registered message types * of a certain protocol family. */ void rtnl_unregister_all(int protocol) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL); if (!tab) { rtnl_unlock(); return; } for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); kfree_rcu(link, rcu); } rtnl_unlock(); synchronize_net(); kfree(tab); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); /** * __rtnl_register_many - Register rtnetlink message types * @handlers: Array of struct rtnl_msg_handlers * @n: The length of @handlers * * Registers the specified function pointers (at least one of them has * to be non-NULL) to be called whenever a request message for the * specified protocol family and message type is received. * * The special protocol family PF_UNSPEC may be used to define fallback * function pointers for the case when no entry for the specific protocol * family exists. * * When one element of @handlers fails to register, * 1) built-in: panics. * 2) modules : the previous successful registrations are unwinded * and an error is returned. * * Use rtnl_register_many(). */ int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; int i, err; for (i = 0, handler = handlers; i < n; i++, handler++) { err = rtnl_register_internal(handler->owner, handler->protocol, handler->msgtype, handler->doit, handler->dumpit, handler->flags); if (err) { if (!handler->owner) panic("Unable to register rtnetlink message " "handlers, %pS\n", handlers); __rtnl_unregister_many(handlers, i); break; } } return err; } EXPORT_SYMBOL_GPL(__rtnl_register_many); void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; int i; for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--) rtnl_unregister(handler->protocol, handler->msgtype); } EXPORT_SYMBOL_GPL(__rtnl_unregister_many); static DEFINE_MUTEX(link_ops_mutex); static LIST_HEAD(link_ops); static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) { struct rtnl_link_ops *ops; rcu_read_lock(); list_for_each_entry_rcu(ops, &link_ops, list) { if (!strcmp(ops->kind, kind)) { *srcu_index = srcu_read_lock(&ops->srcu); goto unlock; } } ops = NULL; unlock: rcu_read_unlock(); return ops; } static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) { srcu_read_unlock(&ops->srcu, srcu_index); } /** * rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register * * Returns 0 on success or a negative error code. */ int rtnl_link_register(struct rtnl_link_ops *ops) { struct rtnl_link_ops *tmp; int err; /* Sanity-check max sizes to avoid stack buffer overflow. */ if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) return -EINVAL; /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not * fill up dellink as well. That disables rtnl_dellink. */ if ((ops->alloc || ops->setup) && !ops->dellink) ops->dellink = unregister_netdevice_queue; err = init_srcu_struct(&ops->srcu); if (err) return err; mutex_lock(&link_ops_mutex); list_for_each_entry(tmp, &link_ops, list) { if (!strcmp(ops->kind, tmp->kind)) { err = -EEXIST; goto unlock; } } list_add_tail_rcu(&ops->list, &link_ops); unlock: mutex_unlock(&link_ops_mutex); return err; } EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) { struct net_device *dev; LIST_HEAD(list_kill); for_each_netdev(net, dev) { if (dev->rtnl_link_ops == ops) ops->dellink(dev, &list_kill); } unregister_netdevice_many(&list_kill); } /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace. */ static void rtnl_lock_unregistering_all(void) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { rtnl_lock(); /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ if (!atomic_read(&dev_unreg_count)) break; __rtnl_unlock(); wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&netdev_unregistering_wq, &wait); } /** * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; mutex_lock(&link_ops_mutex); list_del_rcu(&ops->list); mutex_unlock(&link_ops_mutex); synchronize_srcu(&ops->srcu); cleanup_srcu_struct(&ops->srcu); /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); for_each_net(net) __rtnl_kill_links(net, ops); rtnl_unlock(); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; size_t size = 0; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (!master_dev) goto out; ops = master_dev->rtnl_link_ops; if (!ops || !ops->get_slave_size) goto out; /* IFLA_INFO_SLAVE_DATA + nested data */ size = nla_total_size(sizeof(struct nlattr)) + ops->get_slave_size(master_dev, dev); out: rcu_read_unlock(); return size; } static size_t rtnl_link_get_size(const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; size_t size; if (!ops) return 0; size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ if (ops->get_size) /* IFLA_INFO_DATA + nested data */ size += nla_total_size(sizeof(struct nlattr)) + ops->get_size(dev); if (ops->get_xstats_size) /* IFLA_INFO_XSTATS */ size += nla_total_size(ops->get_xstats_size(dev)); size += rtnl_link_get_slave_info_data_size(dev); return size; } static LIST_HEAD(rtnl_af_ops); static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index) { struct rtnl_af_ops *ops; ASSERT_RTNL(); rcu_read_lock(); list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { if (ops->family == family) { *srcu_index = srcu_read_lock(&ops->srcu); goto unlock; } } ops = NULL; unlock: rcu_read_unlock(); return ops; } static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index) { srcu_read_unlock(&ops->srcu, srcu_index); } /** * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * * Return: 0 on success or a negative error code. */ int rtnl_af_register(struct rtnl_af_ops *ops) { int err = init_srcu_struct(&ops->srcu); if (err) return err; rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); return 0; } EXPORT_SYMBOL_GPL(rtnl_af_register); /** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. * @ops: struct rtnl_af_ops * to unregister */ void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); list_del_rcu(&ops->list); rtnl_unlock(); synchronize_rcu(); synchronize_srcu(&ops->srcu); cleanup_srcu_struct(&ops->srcu); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); static size_t rtnl_link_get_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct rtnl_af_ops *af_ops; size_t size; /* IFLA_AF_SPEC */ size = nla_total_size(sizeof(struct nlattr)); rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_link_af_size) { /* AF_* + nested data */ size += nla_total_size(sizeof(struct nlattr)) + af_ops->get_link_af_size(dev, ext_filter_mask); } } rcu_read_unlock(); return size; } static bool rtnl_have_link_slave_info(const struct net_device *dev) { struct net_device *master_dev; bool ret = false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (master_dev && master_dev->rtnl_link_ops) ret = true; rcu_read_unlock(); return ret; } static int rtnl_link_slave_info_fill(struct sk_buff *skb, const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; struct nlattr *slave_data; int err; master_dev = netdev_master_upper_dev_get((struct net_device *) dev); if (!master_dev) return 0; ops = master_dev->rtnl_link_ops; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_slave_info) { slave_data = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA); if (!slave_data) return -EMSGSIZE; err = ops->fill_slave_info(skb, master_dev, dev); if (err < 0) goto err_cancel_slave_data; nla_nest_end(skb, slave_data); } return 0; err_cancel_slave_data: nla_nest_cancel(skb, slave_data); return err; } static int rtnl_link_info_fill(struct sk_buff *skb, const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; struct nlattr *data; int err; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_xstats) { err = ops->fill_xstats(skb, dev); if (err < 0) return err; } if (ops->fill_info) { data = nla_nest_start_noflag(skb, IFLA_INFO_DATA); if (data == NULL) return -EMSGSIZE; err = ops->fill_info(skb, dev); if (err < 0) goto err_cancel_data; nla_nest_end(skb, data); } return 0; err_cancel_data: nla_nest_cancel(skb, data); return err; } static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *linkinfo; int err = -EMSGSIZE; linkinfo = nla_nest_start_noflag(skb, IFLA_LINKINFO); if (linkinfo == NULL) goto out; err = rtnl_link_info_fill(skb, dev); if (err < 0) goto err_cancel_link; err = rtnl_link_slave_info_fill(skb, dev); if (err < 0) goto err_cancel_link; nla_nest_end(skb, linkinfo); return 0; err_cancel_link: nla_nest_cancel(skb, linkinfo); out: return err; } int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL); } int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) { struct sock *rtnl = net->rtnl; return nlmsg_unicast(rtnl, skb, pid); } EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, const struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags); } EXPORT_SYMBOL(rtnl_notify); void rtnl_set_sk_err(struct net *net, u32 group, int error) { struct sock *rtnl = net->rtnl; netlink_set_err(rtnl, 0, group, error); } EXPORT_SYMBOL(rtnl_set_sk_err); int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) { struct nlattr *mx; int i, valid = 0; /* nothing is dumped for dst_default_metrics, so just skip the loop */ if (metrics == dst_default_metrics.metrics) return 0; mx = nla_nest_start_noflag(skb, RTA_METRICS); if (mx == NULL) return -ENOBUFS; for (i = 0; i < RTAX_MAX; i++) { if (metrics[i]) { if (i == RTAX_CC_ALGO - 1) { char tmp[TCP_CA_NAME_MAX], *name; name = tcp_ca_get_name_by_key(metrics[i], tmp); if (!name) continue; if (nla_put_string(skb, i + 1, name)) goto nla_put_failure; } else if (i == RTAX_FEATURES - 1) { u32 user_features = metrics[i] & RTAX_FEATURE_MASK; if (!user_features) continue; BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); if (nla_put_u32(skb, i + 1, user_features)) goto nla_put_failure; } else { if (nla_put_u32(skb, i + 1, metrics[i])) goto nla_put_failure; } valid++; } } if (!valid) { nla_nest_cancel(skb, mx); return 0; } return nla_nest_end(skb, mx); nla_put_failure: nla_nest_cancel(skb, mx); return -EMSGSIZE; } EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error) { struct rta_cacheinfo ci = { .rta_error = error, .rta_id = id, }; if (dst) { ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); ci.rta_used = dst->__use; ci.rta_clntref = rcuref_read(&dst->__rcuref); } if (expires) { unsigned long clock; clock = jiffies_to_clock_t(abs(expires)); clock = min_t(unsigned long, clock, INT_MAX); ci.rta_expires = (expires > 0) ? clock : -clock; } return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); void netif_set_operstate(struct net_device *dev, int newstate) { unsigned int old = READ_ONCE(dev->operstate); do { if (old == newstate) return; } while (!try_cmpxchg(&dev->operstate, &old, newstate)); netif_state_change(dev); } EXPORT_SYMBOL(netif_set_operstate); static void set_operstate(struct net_device *dev, unsigned char transition) { unsigned char operstate = READ_ONCE(dev->operstate); switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; case IF_OPER_TESTING: if (netif_oper_up(dev)) operstate = IF_OPER_TESTING; break; case IF_OPER_DORMANT: if (netif_oper_up(dev)) operstate = IF_OPER_DORMANT; break; } netif_set_operstate(dev, operstate); } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) { return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) | (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); } static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, const struct ifinfomsg *ifm) { unsigned int flags = ifm->ifi_flags; /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ if (ifm->ifi_change) flags = (flags & ifm->ifi_change) | (rtnl_dev_get_flags(dev) & ~ifm->ifi_change); return flags; } static void copy_rtnl_link_stats(struct rtnl_link_stats *a, const struct rtnl_link_stats64 *b) { a->rx_packets = b->rx_packets; a->tx_packets = b->tx_packets; a->rx_bytes = b->rx_bytes; a->tx_bytes = b->tx_bytes; a->rx_errors = b->rx_errors; a->tx_errors = b->tx_errors; a->rx_dropped = b->rx_dropped; a->tx_dropped = b->tx_dropped; a->multicast = b->multicast; a->collisions = b->collisions; a->rx_length_errors = b->rx_length_errors; a->rx_over_errors = b->rx_over_errors; a->rx_crc_errors = b->rx_crc_errors; a->rx_frame_errors = b->rx_frame_errors; a->rx_fifo_errors = b->rx_fifo_errors; a->rx_missed_errors = b->rx_missed_errors; a->tx_aborted_errors = b->tx_aborted_errors; a->tx_carrier_errors = b->tx_carrier_errors; a->tx_fifo_errors = b->tx_fifo_errors; a->tx_heartbeat_errors = b->tx_heartbeat_errors; a->tx_window_errors = b->tx_window_errors; a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; a->rx_nohandler = b->rx_nohandler; } /* All VF info */ static inline int rtnl_vfinfo_size(const struct net_device *dev, u32 ext_filter_mask) { if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); size_t size = nla_total_size(0); size += num_vfs * (nla_total_size(0) + nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_broadcast)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ nla_total_size(MAX_VLAN_LIST_LEN * sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + nla_total_size(sizeof(struct ifla_vf_trust))); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { size += num_vfs * (nla_total_size(0) + /* nest IFLA_VF_STATS */ /* IFLA_VF_STATS_RX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_BROADCAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_DROPPED */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_DROPPED */ nla_total_size_64bit(sizeof(__u64))); } if (dev->netdev_ops->ndo_get_vf_guid) size += num_vfs * 2 * nla_total_size(sizeof(struct ifla_vf_guid)); return size; } else return 0; } static size_t rtnl_port_size(const struct net_device *dev, u32 ext_filter_mask) { size_t port_size = nla_total_size(4) /* PORT_VF */ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + nla_total_size(1) /* PROT_VDP_REQUEST */ + nla_total_size(2); /* PORT_VDP_RESPONSE */ size_t vf_ports_size = nla_total_size(sizeof(struct nlattr)); size_t vf_port_size = nla_total_size(sizeof(struct nlattr)) + port_size; size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + port_size; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; if (dev_num_vf(dev->dev.parent)) return port_self_size + vf_ports_size + vf_port_size * dev_num_vf(dev->dev.parent); else return port_self_size; } static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ nla_total_size(4); /* XDP_<mode>_PROG_ID */ return xdp_size; } static size_t rtnl_prop_list_size(const struct net_device *dev) { struct netdev_name_node *name_node; unsigned int cnt = 0; rcu_read_lock(); list_for_each_entry_rcu(name_node, &dev->name_node->list, list) cnt++; rcu_read_unlock(); if (!cnt) return 0; return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ); } static size_t rtnl_proto_down_size(const struct net_device *dev) { size_t size = nla_total_size(1); /* Assume dev->proto_down_reason is not zero. */ size += nla_total_size(0) + nla_total_size(4); return size; } static size_t rtnl_devlink_port_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DEVLINK_PORT */ if (dev->devlink_port) size += devlink_nl_port_handle_size(dev->devlink_port); return size; } static size_t rtnl_dpll_pin_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */ size += dpll_netdev_pin_handle_size(dev); return size; } static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) + nla_total_size(sizeof(struct rtnl_link_stats)) + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ + nla_total_size(4) /* IFLA_WEIGHT */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + nla_total_size(4) /* IFLA_ALLMULTI */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(1) /* IFLA_NETNS_IMMUTABLE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(4) /* IFLA_LINK_NETNSID */ + nla_total_size(4) /* IFLA_GROUP */ + nla_total_size(ext_filter_mask & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + rtnl_proto_down_size(dev) /* proto down */ + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + rtnl_prop_list_size(dev) + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + 0; } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *vf_ports; struct nlattr *vf_port; int vf; int err; vf_ports = nla_nest_start_noflag(skb, IFLA_VF_PORTS); if (!vf_ports) return -EMSGSIZE; for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { vf_port = nla_nest_start_noflag(skb, IFLA_VF_PORT); if (!vf_port) goto nla_put_failure; if (nla_put_u32(skb, IFLA_PORT_VF, vf)) goto nla_put_failure; err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb); if (err == -EMSGSIZE) goto nla_put_failure; if (err) { nla_nest_cancel(skb, vf_port); continue; } nla_nest_end(skb, vf_port); } nla_nest_end(skb, vf_ports); return 0; nla_put_failure: nla_nest_cancel(skb, vf_ports); return -EMSGSIZE; } static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *port_self; int err; port_self = nla_nest_start_noflag(skb, IFLA_PORT_SELF); if (!port_self) return -EMSGSIZE; err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb); if (err) { nla_nest_cancel(skb, port_self); return (err == -EMSGSIZE) ? err : 0; } nla_nest_end(skb, port_self); return 0; } static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { int err; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; err = rtnl_port_self_fill(skb, dev); if (err) return err; if (dev_num_vf(dev->dev.parent)) { err = rtnl_vf_ports_fill(skb, dev); if (err) return err; } return 0; } static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; struct netdev_phys_item_id ppid; err = dev_get_phys_port_id(dev, &ppid); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) { char name[IFNAMSIZ]; int err; err = dev_get_phys_port_name(dev, name, sizeof(name)); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) return -EMSGSIZE; return 0; } static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { struct netdev_phys_item_id ppid = { }; int err; err = dev_get_port_parent_id(dev, &ppid, false); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_stats64 *sp; struct nlattr *attr; attr = nla_reserve_64bit(skb, IFLA_STATS64, sizeof(struct rtnl_link_stats64), IFLA_PAD); if (!attr) return -EMSGSIZE; sp = nla_data(attr); dev_get_stats(dev, sp); attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (!attr) return -EMSGSIZE; copy_rtnl_link_stats(nla_data(attr), sp); return 0; } static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; struct ifla_vf_mac vf_mac; struct ifla_vf_broadcast vf_broadcast; struct ifla_vf_info ivi; struct ifla_vf_guid node_guid; struct ifla_vf_guid port_guid; memset(&ivi, 0, sizeof(ivi)); /* Not all SR-IOV capable drivers support the * spoofcheck and "RSS query enable" query. Preset to * -1 so the user space tool can detect that the driver * didn't report anything. */ ivi.spoofchk = -1; ivi.rss_query_en = -1; ivi.trusted = -1; /* The default value for VF link state is "auto" * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; /* VLAN Protocol by default is 802.1Q */ ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; memset(&vf_vlan_info, 0, sizeof(vf_vlan_info)); memset(&node_guid, 0, sizeof(node_guid)); memset(&port_guid, 0, sizeof(port_guid)); vf_mac.vf = vf_vlan.vf = vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = vf_linkstate.vf = vf_rss_query_en.vf = vf_trust.vf = node_guid.vf = port_guid.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_vlan_info.vlan = ivi.vlan; vf_vlan_info.qos = ivi.qos; vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; vf_spoofchk.setting = ivi.spoofchk; vf_linkstate.link_state = ivi.linkstate; vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) return -EMSGSIZE; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), &vf_rate) || nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate) || nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), &vf_spoofchk) || nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), &vf_linkstate) || nla_put(skb, IFLA_VF_RSS_QUERY_EN, sizeof(vf_rss_query_en), &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) goto nla_put_vf_failure; if (dev->netdev_ops->ndo_get_vf_guid && !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid, &port_guid)) { if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid), &node_guid) || nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid), &port_guid)) goto nla_put_vf_failure; } vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST); if (!vfvlanlist) goto nla_put_vf_failure; if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), &vf_vlan_info)) { nla_nest_cancel(skb, vfvlanlist); goto nla_put_vf_failure; } nla_nest_end(skb, vfvlanlist); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); if (!vfstats) goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, vf_stats.tx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, vf_stats.multicast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { nla_nest_cancel(skb, vfstats); goto nla_put_vf_failure; } nla_nest_end(skb, vfstats); } nla_nest_end(skb, vf); return 0; nla_put_vf_failure: nla_nest_cancel(skb, vf); return -EMSGSIZE; } static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { struct nlattr *vfinfo; int i, num_vfs; if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0)) return 0; num_vfs = dev_num_vf(dev->dev.parent); if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs)) return -EMSGSIZE; if (!dev->netdev_ops->ndo_get_vf_config) return 0; vfinfo = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST); if (!vfinfo) return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) { nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } } nla_nest_end(skb, vfinfo); return 0; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, const struct net_device *dev) { struct rtnl_link_ifmap map; memset(&map, 0, sizeof(map)); map.mem_start = READ_ONCE(dev->mem_start); map.mem_end = READ_ONCE(dev->mem_end); map.base_addr = READ_ONCE(dev->base_addr); map.irq = READ_ONCE(dev->irq); map.dma = READ_ONCE(dev->dma); map.port = READ_ONCE(dev->if_port); if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) return -EMSGSIZE; return 0; } static u32 rtnl_xdp_prog_skb(struct net_device *dev) { const struct bpf_prog *generic_xdp_prog; u32 res = 0; rcu_read_lock(); generic_xdp_prog = rcu_dereference(dev->xdp_prog); if (generic_xdp_prog) res = generic_xdp_prog->aux->id; rcu_read_unlock(); return res; } static u32 rtnl_xdp_prog_drv(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_DRV); } static u32 rtnl_xdp_prog_hw(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_HW); } static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, u32 (*get_prog_id)(struct net_device *dev)) { u32 curr_id; int err; curr_id = get_prog_id(dev); if (!curr_id) return 0; *prog_id = curr_id; err = nla_put_u32(skb, attr, curr_id); if (err) return err; if (*mode != XDP_ATTACHED_NONE) *mode = XDP_ATTACHED_MULTI; else *mode = tgt_mode; return 0; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *xdp; u32 prog_id; int err; u8 mode; xdp = nla_nest_start_noflag(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; prog_id = 0; mode = XDP_ATTACHED_NONE; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw); if (err) goto err_cancel; err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; if (prog_id && mode != XDP_ATTACHED_MULTI) { err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; } nla_nest_end(skb, xdp); return 0; err_cancel: nla_nest_cancel(skb, xdp); return err; } static u32 rtnl_get_event(unsigned long event) { u32 rtnl_event_type = IFLA_EVENT_NONE; switch (event) { case NETDEV_REBOOT: rtnl_event_type = IFLA_EVENT_REBOOT; break; case NETDEV_FEAT_CHANGE: rtnl_event_type = IFLA_EVENT_FEATURES; break; case NETDEV_BONDING_FAILOVER: rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; break; case NETDEV_NOTIFY_PEERS: rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; break; case NETDEV_RESEND_IGMP: rtnl_event_type = IFLA_EVENT_IGMP_RESEND; break; case NETDEV_CHANGEINFODATA: rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; break; default: break; } return rtnl_event_type; } static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) { const struct net_device *upper_dev; int ret = 0; rcu_read_lock(); upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) ret = nla_put_u32(skb, IFLA_MASTER, READ_ONCE(upper_dev->ifindex)); rcu_read_unlock(); return ret; } static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, bool force) { int iflink = dev_get_iflink(dev); if (force || READ_ONCE(dev->ifindex) != iflink) return nla_put_u32(skb, IFLA_LINK, iflink); return 0; } static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, struct net_device *dev) { char buf[IFALIASZ]; int ret; ret = dev_get_alias(dev, buf, sizeof(buf)); return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; } static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev, struct net *src_net, gfp_t gfp) { bool put_iflink = false; if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { int id = peernet2id_alloc(src_net, link_net, gfp); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) return -EMSGSIZE; put_iflink = true; } } return nla_put_iflink(skb, dev, put_iflink); } static int rtnl_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { const struct rtnl_af_ops *af_ops; struct nlattr *af_spec; af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af_spec) return -EMSGSIZE; list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { struct nlattr *af; int err; if (!af_ops->fill_link_af) continue; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) return -EMSGSIZE; err = af_ops->fill_link_af(skb, dev, ext_filter_mask); /* * Caller may return ENODATA to indicate that there * was no data to be dumped. This is not an error, it * means we should trim the attribute header and * continue. */ if (err == -ENODATA) nla_nest_cancel(skb, af); else if (err < 0) return -EMSGSIZE; nla_nest_end(skb, af); } nla_nest_end(skb, af_spec); return 0; } static int rtnl_fill_alt_ifnames(struct sk_buff *skb, const struct net_device *dev) { struct netdev_name_node *name_node; int count = 0; list_for_each_entry_rcu(name_node, &dev->name_node->list, list) { if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) return -EMSGSIZE; count++; } return count; } /* RCU protected. */ static int rtnl_fill_prop_list(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *prop_list; int ret; prop_list = nla_nest_start(skb, IFLA_PROP_LIST); if (!prop_list) return -EMSGSIZE; ret = rtnl_fill_alt_ifnames(skb, dev); if (ret <= 0) goto nest_cancel; nla_nest_end(skb, prop_list); return 0; nest_cancel: nla_nest_cancel(skb, prop_list); return ret; } static int rtnl_fill_proto_down(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *pr; u32 preason; if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down))) goto nla_put_failure; preason = READ_ONCE(dev->proto_down_reason); if (!preason) return 0; pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); if (!pr) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { nla_nest_cancel(skb, pr); goto nla_put_failure; } nla_nest_end(skb, pr); return 0; nla_put_failure: return -EMSGSIZE; } static int rtnl_fill_devlink_port(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *devlink_port_nest; int ret; devlink_port_nest = nla_nest_start(skb, IFLA_DEVLINK_PORT); if (!devlink_port_nest) return -EMSGSIZE; if (dev->devlink_port) { ret = devlink_nl_port_handle_fill(skb, dev->devlink_port); if (ret < 0) goto nest_cancel; } nla_nest_end(skb, devlink_port_nest); return 0; nest_cancel: nla_nest_cancel(skb, devlink_port_nest); return ret; } static int rtnl_fill_dpll_pin(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *dpll_pin_nest; int ret; dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN); if (!dpll_pin_nest) return -EMSGSIZE; ret = dpll_netdev_add_pin_handle(skb, dev); if (ret < 0) goto nest_cancel; nla_nest_end(skb, dpll_pin_nest); return 0; nest_cancel: nla_nest_cancel(skb, dpll_pin_nest); return ret; } static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, u32 event, int *new_nsid, int new_ifindex, int tgt_netnsid, gfp_t gfp) { char devname[IFNAMSIZ]; struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifi_family = AF_UNSPEC; ifm->__ifi_pad = 0; ifm->ifi_type = READ_ONCE(dev->type); ifm->ifi_index = READ_ONCE(dev->ifindex); ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; netdev_copy_name(dev, devname); if (nla_put_string(skb, IFLA_IFNAME, devname)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) || nla_put_u8(skb, IFLA_OPERSTATE, netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN) || nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || nla_put_u8(skb, IFLA_NETNS_IMMUTABLE, dev->netns_immutable) || nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) || nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) || nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, READ_ONCE(dev->num_tx_queues)) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, READ_ONCE(dev->gso_max_segs)) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, READ_ONCE(dev->gso_max_size)) || nla_put_u32(skb, IFLA_GRO_MAX_SIZE, READ_ONCE(dev->gro_max_size)) || nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, READ_ONCE(dev->gso_ipv4_max_size)) || nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, READ_ONCE(dev->gro_ipv4_max_size)) || nla_put_u32(skb, IFLA_TSO_MAX_SIZE, READ_ONCE(dev->tso_max_size)) || nla_put_u32(skb, IFLA_TSO_MAX_SEGS, READ_ONCE(dev->tso_max_segs)) || nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON, READ_ONCE(dev->max_pacing_offload_horizon)) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, READ_ONCE(dev->num_rx_queues)) || #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + atomic_read(&dev->carrier_down_count)) || nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, atomic_read(&dev->carrier_down_count))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) goto nla_put_failure; if (event != IFLA_EVENT_NONE) { if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; } if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast)) goto nla_put_failure; } if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_port_name_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; if (rtnl_fill_vf(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_xdp_fill(skb, dev)) goto nla_put_failure; if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; } if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; if (new_ifindex && nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) goto nla_put_failure; if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) && nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr)) goto nla_put_failure; rcu_read_lock(); if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC)) goto nla_put_failure_rcu; qdisc = rcu_dereference(dev->qdisc); if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) goto nla_put_failure_rcu; if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure_rcu; if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure_rcu; if (rtnl_fill_prop_list(skb, dev)) goto nla_put_failure_rcu; rcu_read_unlock(); if (dev->dev.parent && nla_put_string(skb, IFLA_PARENT_DEV_NAME, dev_name(dev->dev.parent))) goto nla_put_failure; if (dev->dev.parent && dev->dev.parent->bus && nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME, dev->dev.parent->bus->name)) goto nla_put_failure; if (rtnl_fill_devlink_port(skb, dev)) goto nla_put_failure; if (rtnl_fill_dpll_pin(skb, dev)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure_rcu: rcu_read_unlock(); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN }, [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, [IFLA_MTU] = { .type = NLA_U32 }, [IFLA_LINK] = { .type = NLA_U32 }, [IFLA_MASTER] = { .type = NLA_U32 }, [IFLA_CARRIER] = { .type = NLA_U8 }, [IFLA_TXQLEN] = { .type = NLA_U32 }, [IFLA_WEIGHT] = { .type = NLA_U32 }, [IFLA_OPERSTATE] = { .type = NLA_U8 }, [IFLA_LINKMODE] = { .type = NLA_U8 }, [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_NET_NS_FD] = { .type = NLA_U32 }, /* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to * allow 0-length string (needed to remove an alias). */ [IFLA_IFALIAS] = { .type = NLA_BINARY, .len = IFALIASZ - 1 }, [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, [IFLA_VF_PORTS] = { .type = NLA_NESTED }, [IFLA_PORT_SELF] = { .type = NLA_NESTED }, [IFLA_AF_SPEC] = { .type = NLA_NESTED }, [IFLA_EXT_MASK] = { .type = NLA_U32 }, [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, [IFLA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, [IFLA_MIN_MTU] = { .type = NLA_U32 }, [IFLA_MAX_MTU] = { .type = NLA_U32 }, [IFLA_PROP_LIST] = { .type = NLA_NESTED }, [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, [IFLA_ALLMULTI] = { .type = NLA_REJECT }, [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_KIND] = { .type = NLA_STRING }, [IFLA_INFO_DATA] = { .type = NLA_NESTED }, [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING }, [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_BROADCAST] = { .type = NLA_REJECT }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, [IFLA_VF_STATS] = { .type = NLA_NESTED }, [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) }, [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_VF] = { .type = NLA_U32 }, [IFLA_PORT_PROFILE] = { .type = NLA_STRING, .len = PORT_PROFILE_MAX }, [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, .len = PORT_UUID_MAX }, [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, .len = PORT_UUID_MAX }, [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, /* Unused, but we need to keep it here since user space could * fill it. It's also broken with regard to NLA_BINARY use in * combination with structs. */ [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, .len = sizeof(struct ifla_port_vsi) }, }; static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD }, [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla, int *ops_srcu_index) { struct nlattr *linfo[IFLA_INFO_MAX + 1]; struct rtnl_link_ops *ops = NULL; if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; if (linfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind, ops_srcu_index); } return ops; } static bool link_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = netdev_master_upper_dev_get(dev); /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need * another invalid value for ifindex to denote "no master". */ if (master_idx == -1) return !!master; if (!master || master->ifindex != master_idx) return true; return false; } static bool link_kind_filtered(const struct net_device *dev, const struct rtnl_link_ops *kind_ops) { if (kind_ops && dev->rtnl_link_ops != kind_ops) return true; return false; } static bool link_dump_filtered(struct net_device *dev, int master_idx, const struct rtnl_link_ops *kind_ops) { if (link_master_filtered(dev, master_idx) || link_kind_filtered(dev, kind_ops)) return true; return false; } /** * rtnl_get_net_ns_capable - Get netns if sufficiently privileged. * @sk: netlink socket * @netnsid: network namespace identifier * * Returns the network namespace identified by netnsid on success or an error * pointer on failure. */ struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid) { struct net *net; net = get_net_ns_by_id(sock_net(sk), netnsid); if (!net) return ERR_PTR(-EINVAL); /* For now, the caller is required to have CAP_NET_ADMIN in * the user namespace owning the target net ns. */ if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) { put_net(net); return ERR_PTR(-EACCES); } return net; } EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable); static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, bool strict_check, struct nlattr **tb, struct netlink_ext_ack *extack) { int hdrlen; if (strict_check) { struct ifinfomsg *ifm; ifm = nlmsg_payload(nlh, sizeof(*ifm)); if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for link dump"); return -EINVAL; } if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); return -EINVAL; } if (ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps"); return -EINVAL; } return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); } /* A hack to preserve kernel<->userspace interface. * The correct header is ifinfomsg. It is consistent with rtnl_getlink. * However, before Linux v3.9 the code here assumed rtgenmsg and that's * what iproute2 < v3.9.0 used. * We can detect the old iproute2. Even including the IFLA_EXT_MASK * attribute, its netlink message is shorter than struct ifinfomsg. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, extack); } static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct rtnl_link_ops *kind_ops = NULL; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct nlattr *tb[IFLA_MAX+1]; struct { unsigned long ifindex; } *ctx = (void *)cb->ctx; struct net *tgt_net = net; u32 ext_filter_mask = 0; struct net_device *dev; int ops_srcu_index; int master_idx = 0; int netnsid = -1; int err, i; err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); if (err < 0) { if (cb->strict_check) return err; goto walk_entries; } for (i = 0; i <= IFLA_MAX; ++i) { if (!tb[i]) continue; /* new attributes should only be added with strict checking */ switch (i) { case IFLA_TARGET_NETNSID: netnsid = nla_get_s32(tb[i]); tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); if (IS_ERR(tgt_net)) { NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); err = PTR_ERR(tgt_net); netnsid = -1; goto out; } break; case IFLA_EXT_MASK: ext_filter_mask = nla_get_u32(tb[i]); break; case IFLA_MASTER: master_idx = nla_get_u32(tb[i]); break; case IFLA_LINKINFO: kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index); break; default: if (cb->strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); err = -EINVAL; goto out; } } } if (master_idx || kind_ops) flags |= NLM_F_DUMP_FILTERED; walk_entries: err = 0; for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { if (link_dump_filtered(dev, master_idx, kind_ops)) continue; err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, 0, flags, ext_filter_mask, 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) break; } cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); out: if (kind_ops) rtnl_link_ops_put(kind_ops, ops_srcu_index); if (netnsid >= 0) put_net(tgt_net); return err; } int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, struct netlink_ext_ack *exterr) { const struct ifinfomsg *ifmp; const struct nlattr *attrs; size_t len; ifmp = nla_data(nla_peer); attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg); len = nla_len(nla_peer) - sizeof(struct ifinfomsg); if (ifmp->ifi_index < 0) { NL_SET_ERR_MSG_ATTR(exterr, nla_peer, "ifindex can't be negative"); return -EINVAL; } return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy, exterr); } EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[]) { struct net *net = NULL; /* Examine the link attributes and figure out which * network namespace we are talking about. */ if (tb[IFLA_NET_NS_PID]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); return net; } struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { struct net *net = rtnl_link_get_net_ifla(tb); if (!net) net = get_net(src_net); return net; } EXPORT_SYMBOL(rtnl_link_get_net); /* Figure out which network namespace we are talking about by * examining the link attributes in the following order: * * 1. IFLA_NET_NS_PID * 2. IFLA_NET_NS_FD * 3. IFLA_TARGET_NETNSID */ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, struct nlattr *tb[]) { struct net *net; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) return rtnl_link_get_net(src_net, tb); if (!tb[IFLA_TARGET_NETNSID]) return get_net(src_net); net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID])); if (!net) return ERR_PTR(-EINVAL); return net; } static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, struct net *src_net, struct nlattr *tb[], int cap) { struct net *net; net = rtnl_link_get_net_by_nlattr(src_net, tb); if (IS_ERR(net)) return net; if (!netlink_ns_capable(skb, net->user_ns, cap)) { put_net(net); return ERR_PTR(-EPERM); } return net; } /* Verify that rtnetlink requests do not pass additional properties * potentially referring to different network namespaces. */ static int rtnl_ensure_unique_netns(struct nlattr *tb[], struct netlink_ext_ack *extack, bool netns_id_only) { if (netns_id_only) { if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD]) return 0; NL_SET_ERR_MSG(extack, "specified netns attribute not supported"); return -EOPNOTSUPP; } if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID])) goto invalid_attr; return 0; invalid_attr: NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified"); return -EINVAL; } static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_set_vf_rate) return -EOPNOTSUPP; if (max_tx_rate && max_tx_rate < min_tx_rate) return -EINVAL; return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate); } static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS] && nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) return -EINVAL; if (tb[IFLA_BROADCAST] && nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) return -EINVAL; if (tb[IFLA_GSO_MAX_SIZE] && nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) { NL_SET_ERR_MSG(extack, "too big gso_max_size"); return -EINVAL; } if (tb[IFLA_GSO_MAX_SEGS] && (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS || nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) { NL_SET_ERR_MSG(extack, "too big gso_max_segs"); return -EINVAL; } if (tb[IFLA_GRO_MAX_SIZE] && nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) { NL_SET_ERR_MSG(extack, "too big gro_max_size"); return -EINVAL; } if (tb[IFLA_GSO_IPV4_MAX_SIZE] && nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); return -EINVAL; } if (tb[IFLA_GRO_IPV4_MAX_SIZE] && nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) { NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size"); return -EINVAL; } if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { struct rtnl_af_ops *af_ops; int af_ops_srcu_index; af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) err = -EOPNOTSUPP; else if (af_ops->validate_link_af) err = af_ops->validate_link_af(dev, af, extack); else err = 0; rtnl_af_put(af_ops, af_ops_srcu_index); if (err < 0) return err; } } return 0; } static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { const struct net_device_ops *ops = dev->netdev_ops; return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type); } static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { if (dev->type != ARPHRD_INFINIBAND) return -EOPNOTSUPP; return handle_infiniband_guid(dev, ivt, guid_type); } static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) { const struct net_device_ops *ops = dev->netdev_ops; int err = -EINVAL; if (tb[IFLA_VF_MAC]) { struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]); if (ivm->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_mac) err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac); if (err < 0) return err; } if (tb[IFLA_VF_VLAN]) { struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]); if (ivv->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, ivv->qos, htons(ETH_P_8021Q)); if (err < 0) return err; } if (tb[IFLA_VF_VLAN_LIST]) { struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; struct nlattr *attr; int rem, len = 0; err = -EOPNOTSUPP; if (!ops->ndo_set_vf_vlan) return err; nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { if (nla_type(attr) != IFLA_VF_VLAN_INFO || nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) { return -EINVAL; } if (len >= MAX_VLAN_LIST_LEN) return -EOPNOTSUPP; ivvl[len] = nla_data(attr); len++; } if (len == 0) return -EINVAL; if (ivvl[0]->vf >= INT_MAX) return -EINVAL; err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } if (tb[IFLA_VF_TX_RATE]) { struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]); struct ifla_vf_info ivf; if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_get_vf_config) err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf); if (err < 0) return err; err = rtnl_set_vf_rate(dev, ivt->vf, ivf.min_tx_rate, ivt->rate); if (err < 0) return err; } if (tb[IFLA_VF_RATE]) { struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]); if (ivt->vf >= INT_MAX) return -EINVAL; err = rtnl_set_vf_rate(dev, ivt->vf, ivt->min_tx_rate, ivt->max_tx_rate); if (err < 0) return err; } if (tb[IFLA_VF_SPOOFCHK]) { struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]); if (ivs->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_spoofchk) err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, ivs->setting); if (err < 0) return err; } if (tb[IFLA_VF_LINK_STATE]) { struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]); if (ivl->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_link_state) err = ops->ndo_set_vf_link_state(dev, ivl->vf, ivl->link_state); if (err < 0) return err; } if (tb[IFLA_VF_RSS_QUERY_EN]) { struct ifla_vf_rss_query_en *ivrssq_en; err = -EOPNOTSUPP; ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]); if (ivrssq_en->vf >= INT_MAX) return -EINVAL; if (ops->ndo_set_vf_rss_query_en) err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf, ivrssq_en->setting); if (err < 0) return err; } if (tb[IFLA_VF_TRUST]) { struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]); if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_trust) err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting); if (err < 0) return err; } if (tb[IFLA_VF_IB_NODE_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); if (ivt->vf >= INT_MAX) return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); } if (tb[IFLA_VF_IB_PORT_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); if (ivt->vf >= INT_MAX) return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID); } return err; } static int do_set_master(struct net_device *dev, int ifindex, struct netlink_ext_ack *extack) { struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; int err; /* Release the lower lock, the upper is responsible for locking * the lower if needed. None of the existing upper devices * use netdev instance lock, so don't grab it. */ if (upper_dev) { if (upper_dev->ifindex == ifindex) return 0; ops = upper_dev->netdev_ops; if (ops->ndo_del_slave) { netdev_unlock_ops(dev); err = ops->ndo_del_slave(upper_dev, dev); netdev_lock_ops(dev); if (err) return err; } else { return -EOPNOTSUPP; } } if (ifindex) { upper_dev = __dev_get_by_index(dev_net(dev), ifindex); if (!upper_dev) return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { netdev_unlock_ops(dev); err = ops->ndo_add_slave(upper_dev, dev, extack); netdev_lock_ops(dev); if (err) return err; } else { return -EOPNOTSUPP; } } return 0; } static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, }; static int do_set_proto_down(struct net_device *dev, struct nlattr *nl_proto_down, struct nlattr *nl_proto_down_reason, struct netlink_ext_ack *extack) { struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; unsigned long mask = 0; u32 value; bool proto_down; int err; if (!dev->change_proto_down) { NL_SET_ERR_MSG(extack, "Protodown not supported by device"); return -EOPNOTSUPP; } if (nl_proto_down_reason) { err = nla_parse_nested_deprecated(pdreason, IFLA_PROTO_DOWN_REASON_MAX, nl_proto_down_reason, ifla_proto_down_reason_policy, NULL); if (err < 0) return err; if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) { NL_SET_ERR_MSG(extack, "Invalid protodown reason value"); return -EINVAL; } value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]); if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); netdev_change_proto_down_reason_locked(dev, mask, value); } if (nl_proto_down) { proto_down = nla_get_u8(nl_proto_down); /* Don't turn off protodown if there are active reasons */ if (!proto_down && dev->proto_down_reason) { NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); return -EBUSY; } err = netif_change_proto_down(dev, proto_down); if (err) return err; } return 0; } #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct net *tgt_net, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb, int status) { const struct net_device_ops *ops = dev->netdev_ops; char ifname[IFNAMSIZ]; int err; err = validate_linkmsg(dev, tb, extack); if (err < 0) return err; if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; if (!net_eq(tgt_net, dev_net(dev))) { const char *pat = ifname[0] ? ifname : NULL; int new_ifindex; new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0); err = __dev_change_net_namespace(dev, tgt_net, pat, new_ifindex, extack); if (err) return err; status |= DO_SETLINK_MODIFIED; } netdev_lock_ops(dev); if (tb[IFLA_MAP]) { struct rtnl_link_ifmap *u_map; struct ifmap k_map; if (!ops->ndo_set_config) { err = -EOPNOTSUPP; goto errout; } if (!netif_device_present(dev)) { err = -ENODEV; goto errout; } u_map = nla_data(tb[IFLA_MAP]); k_map.mem_start = (unsigned long) u_map->mem_start; k_map.mem_end = (unsigned long) u_map->mem_end; k_map.base_addr = (unsigned short) u_map->base_addr; k_map.irq = (unsigned char) u_map->irq; k_map.dma = (unsigned char) u_map->dma; k_map.port = (unsigned char) u_map->port; err = ops->ndo_set_config(dev, &k_map); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_ADDRESS]) { struct sockaddr *sa; int len; len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, sizeof(*sa)); sa = kmalloc(len, GFP_KERNEL); if (!sa) { err = -ENOMEM; goto errout; } sa->sa_family = dev->type; netdev_unlock_ops(dev); /* dev_addr_sem is an outer lock, enforce proper ordering */ down_write(&dev_addr_sem); netdev_lock_ops(dev); memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); err = netif_set_mac_address(dev, sa, extack); kfree(sa); if (err) { up_write(&dev_addr_sem); goto errout; } status |= DO_SETLINK_MODIFIED; up_write(&dev_addr_sem); } if (tb[IFLA_MTU]) { err = netif_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); status |= DO_SETLINK_NOTIFY; } /* * Interface selected by interface index but interface * name provided implies that a name change has been * requested. */ if (ifm->ifi_index > 0 && ifname[0]) { err = netif_change_name(dev, ifname); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { err = netif_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_BROADCAST]) { nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } if (ifm->ifi_flags || ifm->ifi_change) { err = netif_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), extack); if (err < 0) goto errout; } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_CARRIER]) { err = netif_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_TXQLEN]) { unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); err = netif_change_tx_queue_len(dev, value); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); if (dev->gso_max_size ^ max_size) { netif_set_gso_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); if (dev->gso_max_segs ^ max_segs) { netif_set_gso_max_segs(dev, max_segs); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GRO_MAX_SIZE]) { u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); if (dev->gro_max_size ^ gro_max_size) { netif_set_gro_max_size(dev, gro_max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GSO_IPV4_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]); if (dev->gso_ipv4_max_size ^ max_size) { netif_set_gso_ipv4_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GRO_IPV4_MAX_SIZE]) { u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]); if (dev->gro_ipv4_max_size ^ gro_max_size) { netif_set_gro_ipv4_max_size(dev, gro_max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; WRITE_ONCE(dev->link_mode, value); } if (tb[IFLA_VFINFO_LIST]) { struct nlattr *vfinfo[IFLA_VF_MAX + 1]; struct nlattr *attr; int rem; nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { if (nla_type(attr) != IFLA_VF_INFO || nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } err = nla_parse_nested_deprecated(vfinfo, IFLA_VF_MAX, attr, ifla_vf_policy, NULL); if (err < 0) goto errout; err = do_setvfinfo(dev, vfinfo); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_VF_PORTS]) { struct nlattr *port[IFLA_PORT_MAX+1]; struct nlattr *attr; int vf; int rem; err = -EOPNOTSUPP; if (!ops->ndo_set_vf_port) goto errout; nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { if (nla_type(attr) != IFLA_VF_PORT || nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, attr, ifla_port_policy, NULL); if (err < 0) goto errout; if (!port[IFLA_PORT_VF]) { err = -EOPNOTSUPP; goto errout; } vf = nla_get_u32(port[IFLA_PORT_VF]); err = ops->ndo_set_vf_port(dev, vf, port); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_PORT_SELF]) { struct nlattr *port[IFLA_PORT_MAX+1]; err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, tb[IFLA_PORT_SELF], ifla_port_policy, NULL); if (err < 0) goto errout; err = -EOPNOTSUPP; if (ops->ndo_set_vf_port) err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { struct rtnl_af_ops *af_ops; int af_ops_srcu_index; af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); if (!af_ops) { err = -EAFNOSUPPORT; goto errout; } err = af_ops->set_link_af(dev, af, extack); rtnl_af_put(af_ops, af_ops_srcu_index); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], tb[IFLA_PROTO_DOWN_REASON], extack); if (err) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_XDP]) { struct nlattr *xdp[IFLA_XDP_MAX + 1]; u32 xdp_flags = 0; err = nla_parse_nested_deprecated(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], ifla_xdp_policy, NULL); if (err < 0) goto errout; if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) { err = -EINVAL; goto errout; } if (xdp[IFLA_XDP_FLAGS]) { xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]); if (xdp_flags & ~XDP_FLAGS_MASK) { err = -EINVAL; goto errout; } if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) { err = -EINVAL; goto errout; } } if (xdp[IFLA_XDP_FD]) { int expected_fd = -1; if (xdp_flags & XDP_FLAGS_REPLACE) { if (!xdp[IFLA_XDP_EXPECTED_FD]) { err = -EINVAL; goto errout; } expected_fd = nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]); } err = dev_change_xdp_fd(dev, extack, nla_get_s32(xdp[IFLA_XDP_FD]), expected_fd, xdp_flags); if (err) goto errout; status |= DO_SETLINK_NOTIFY; } } errout: if (status & DO_SETLINK_MODIFIED) { if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY) netif_state_change(dev); if (err < 0) net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", dev->name); } netdev_unlock_ops(dev); return err; } static struct net_device *rtnl_dev_get(struct net *net, struct nlattr *tb[]) { char ifname[ALTIFNAMSIZ]; if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else if (tb[IFLA_ALT_IFNAME]) nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ); else return NULL; return __dev_get_by_name(net, ifname); } static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct rtnl_nets rtnl_nets; struct net *tgt_net; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) goto errout; err = rtnl_ensure_unique_netns(tb, extack, false); if (err < 0) goto errout; tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); if (IS_ERR(tgt_net)) { err = PTR_ERR(tgt_net); goto errout; } rtnl_nets_init(&rtnl_nets); rtnl_nets_add(&rtnl_nets, get_net(net)); rtnl_nets_add(&rtnl_nets, tgt_net); rtnl_nets_lock(&rtnl_nets); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else err = -EINVAL; if (dev) err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); else if (!err) err = -ENODEV; rtnl_nets_unlock(&rtnl_nets); rtnl_nets_destroy(&rtnl_nets); errout: return err; } static int rtnl_group_dellink(const struct net *net, int group) { struct net_device *dev, *aux; LIST_HEAD(list_kill); bool found = false; if (!group) return -EPERM; for_each_netdev(net, dev) { if (dev->group == group) { const struct rtnl_link_ops *ops; found = true; ops = dev->rtnl_link_ops; if (!ops || !ops->dellink) return -EOPNOTSUPP; } } if (!found) return -ENODEV; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { const struct rtnl_link_ops *ops; ops = dev->rtnl_link_ops; ops->dellink(dev, &list_kill); } } unregister_netdevice_many(&list_kill); return 0; } int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh) { const struct rtnl_link_ops *ops; LIST_HEAD(list_kill); ops = dev->rtnl_link_ops; if (!ops || !ops->dellink) return -EOPNOTSUPP; ops->dellink(dev, &list_kill); unregister_netdevice_many_notify(&list_kill, portid, nlh); return 0; } EXPORT_SYMBOL_GPL(rtnl_delete_link); static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct net *tgt_net = net; int netnsid = -1; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err < 0) return err; if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } rtnl_net_lock(tgt_net); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); if (dev) err = rtnl_delete_link(dev, portid, nlh); else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) err = -ENODEV; else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else err = -EINVAL; rtnl_net_unlock(tgt_net); if (netnsid >= 0) put_net(tgt_net); return err; } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh) { unsigned int old_flags, changed; int err; old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), NULL); if (err < 0) return err; } changed = old_flags ^ dev->flags; if (dev->rtnl_link_initializing) { dev->rtnl_link_initializing = false; changed = ~0U; } __dev_notify_flags(dev, old_flags, changed, portid, nlh); return 0; } EXPORT_SYMBOL(rtnl_configure_link); struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_device *dev; unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; int err; if (tb[IFLA_NUM_TX_QUEUES]) num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); else if (ops->get_num_tx_queues) num_tx_queues = ops->get_num_tx_queues(); if (tb[IFLA_NUM_RX_QUEUES]) num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]); else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); if (num_tx_queues < 1 || num_tx_queues > 4096) { NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); return ERR_PTR(-EINVAL); } if (num_rx_queues < 1 || num_rx_queues > 4096) { NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); return ERR_PTR(-EINVAL); } if (ops->alloc) { dev = ops->alloc(tb, ifname, name_assign_type, num_tx_queues, num_rx_queues); if (IS_ERR(dev)) return dev; } else { dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); } if (!dev) return ERR_PTR(-ENOMEM); err = validate_linkmsg(dev, tb, extack); if (err < 0) { free_netdev(dev); return ERR_PTR(err); } dev_net_set(dev, net); dev->rtnl_link_ops = ops; dev->rtnl_link_initializing = true; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); err = dev_validate_mtu(dev, mtu, extack); if (err) { free_netdev(dev); return ERR_PTR(err); } dev->mtu = mtu; } if (tb[IFLA_ADDRESS]) { __dev_addr_set(dev, nla_data(tb[IFLA_ADDRESS]), nla_len(tb[IFLA_ADDRESS])); dev->addr_assign_type = NET_ADDR_SET; } if (tb[IFLA_BROADCAST]) memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), nla_len(tb[IFLA_BROADCAST])); if (tb[IFLA_TXQLEN]) dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); if (tb[IFLA_GRO_MAX_SIZE]) netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); if (tb[IFLA_GSO_IPV4_MAX_SIZE]) netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE])); if (tb[IFLA_GRO_IPV4_MAX_SIZE]) netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE])); return dev; } EXPORT_SYMBOL(rtnl_create_link); struct rtnl_newlink_tbs { struct nlattr *tb[IFLA_MAX + 1]; struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; struct nlattr *attr[RTNL_MAX_TYPE + 1]; struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; }; static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, struct net_device *dev, struct net *tgt_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) { struct nlattr ** const linkinfo = tbs->linkinfo; struct nlattr ** const tb = tbs->tb; int status = 0; int err; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (linkinfo[IFLA_INFO_DATA]) { if (!ops || ops != dev->rtnl_link_ops || !ops->changelink) return -EOPNOTSUPP; err = ops->changelink(dev, tb, data, extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; } if (linkinfo[IFLA_INFO_SLAVE_DATA]) { const struct rtnl_link_ops *m_ops = NULL; struct nlattr **slave_data = NULL; struct net_device *master_dev; master_dev = netdev_master_upper_dev_get(dev); if (master_dev) m_ops = master_dev->rtnl_link_ops; if (!m_ops || !m_ops->slave_changelink) return -EOPNOTSUPP; if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) return -EINVAL; if (m_ops->slave_maxtype) { err = nla_parse_nested_deprecated(tbs->slave_attr, m_ops->slave_maxtype, linkinfo[IFLA_INFO_SLAVE_DATA], m_ops->slave_policy, extack); if (err < 0) return err; slave_data = tbs->slave_attr; } err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; } return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status); } static int rtnl_group_changelink(const struct sk_buff *skb, struct net *net, struct net *tgt_net, int group, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb) { struct net_device *dev, *aux; int err; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); if (err < 0) return err; } } return 0; } static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, struct net *tgt_net, struct net *link_net, struct net *peer_net, const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) { unsigned char name_assign_type = NET_NAME_USER; struct rtnl_newlink_params params = { .src_net = sock_net(skb->sk), .link_net = link_net, .peer_net = peer_net, .tb = tb, .data = data, }; u32 portid = NETLINK_CB(skb).portid; struct net_device *dev; char ifname[IFNAMSIZ]; int err; if (!ops->alloc && !ops->setup) return -EOPNOTSUPP; if (tb[IFLA_IFNAME]) { nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); } else { snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); name_assign_type = NET_NAME_ENUM; } dev = rtnl_create_link(tgt_net, ifname, name_assign_type, ops, tb, extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; } dev->ifindex = ifm->ifi_index; if (ops->newlink) err = ops->newlink(dev, &params, extack); else err = register_netdevice(dev); if (err < 0) { free_netdev(dev); goto out; } netdev_lock_ops(dev); err = rtnl_configure_link(dev, ifm, portid, nlh); if (err < 0) goto out_unregister; if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto out_unregister; } netdev_unlock_ops(dev); out: return err; out_unregister: netdev_unlock_ops(dev); if (ops->newlink) { LIST_HEAD(list_kill); ops->dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); } else { unregister_netdevice(dev); } goto out; } static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, struct nlattr *tbp[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX + 1]; int err; if (!data || !data[ops->peer_type]) return rtnl_link_get_net_ifla(tbp); err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); if (err < 0) return ERR_PTR(err); if (ops->validate) { err = ops->validate(tb, NULL, extack); if (err < 0) return ERR_PTR(err); } return rtnl_link_get_net_ifla(tb); } static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, struct net *tgt_net, struct net *link_net, struct net *peer_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) { struct nlattr ** const tb = tbs->tb; struct net *net = sock_net(skb->sk); struct net *device_net; struct net_device *dev; struct ifinfomsg *ifm; bool link_specified; /* When creating, lookup for existing device in target net namespace */ device_net = (nlh->nlmsg_flags & NLM_F_CREATE) && (nlh->nlmsg_flags & NLM_F_EXCL) ? tgt_net : net; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { link_specified = true; dev = __dev_get_by_index(device_net, ifm->ifi_index); } else if (ifm->ifi_index < 0) { NL_SET_ERR_MSG(extack, "ifindex can't be negative"); return -EINVAL; } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { link_specified = true; dev = rtnl_dev_get(device_net, tb); } else { link_specified = false; dev = NULL; } if (dev) return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, * or it's for a group */ if (link_specified || !tb[IFLA_GROUP]) return -ENODEV; return rtnl_group_changelink(skb, net, tgt_net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); } if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) return -EOPNOTSUPP; if (!ops) { NL_SET_ERR_MSG(extack, "Unknown device type"); return -EOPNOTSUPP; } return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, peer_net, nlh, tb, data, extack); } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *tgt_net, *link_net = NULL, *peer_net = NULL; struct nlattr **tb, **linkinfo, **data = NULL; struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; struct rtnl_nets rtnl_nets; int ops_srcu_index; int ret; tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); if (!tbs) return -ENOMEM; tb = tbs->tb; ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); if (ret < 0) goto free; ret = rtnl_ensure_unique_netns(tb, extack, false); if (ret < 0) goto free; linkinfo = tbs->linkinfo; if (tb[IFLA_LINKINFO]) { ret = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], ifla_info_policy, NULL); if (ret < 0) goto free; } else { memset(linkinfo, 0, sizeof(tbs->linkinfo)); } if (linkinfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind, &ops_srcu_index); #ifdef CONFIG_MODULES if (!ops) { request_module("rtnl-link-%s", kind); ops = rtnl_link_ops_get(kind, &ops_srcu_index); } #endif } rtnl_nets_init(&rtnl_nets); if (ops) { if (ops->maxtype > RTNL_MAX_TYPE) { ret = -EINVAL; goto put_ops; } if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], ops->policy, extack); if (ret < 0) goto put_ops; data = tbs->attr; } if (ops->validate) { ret = ops->validate(tb, data, extack); if (ret < 0) goto put_ops; } if (ops->peer_type) { peer_net = rtnl_get_peer_net(ops, tb, data, extack); if (IS_ERR(peer_net)) { ret = PTR_ERR(peer_net); goto put_ops; } if (peer_net) rtnl_nets_add(&rtnl_nets, peer_net); } } tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN); if (IS_ERR(tgt_net)) { ret = PTR_ERR(tgt_net); goto put_net; } rtnl_nets_add(&rtnl_nets, tgt_net); if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); link_net = get_net_ns_by_id(tgt_net, id); if (!link_net) { NL_SET_ERR_MSG(extack, "Unknown network namespace id"); ret = -EINVAL; goto put_net; } rtnl_nets_add(&rtnl_nets, link_net); if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; goto put_net; } } rtnl_nets_lock(&rtnl_nets); ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, peer_net, tbs, data, extack); rtnl_nets_unlock(&rtnl_nets); put_net: rtnl_nets_destroy(&rtnl_nets); put_ops: if (ops) rtnl_link_ops_put(ops, ops_srcu_index); free: kfree(tbs); return ret; } static int rtnl_valid_getlink_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm; int i, err; ifm = nlmsg_payload(nlh, sizeof(*ifm)); if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for get link"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err) return err; for (i = 0; i <= IFLA_MAX; i++) { if (!tb[i]) continue; switch (i) { case IFLA_IFNAME: case IFLA_ALT_IFNAME: case IFLA_EXT_MASK: case IFLA_TARGET_NETNSID: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request"); return -EINVAL; } } return 0; } static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; int netnsid = -1; int err; u32 ext_filter_mask = 0; err = rtnl_valid_getlink_req(skb, nlh, tb, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err < 0) return err; if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); else goto out; err = -ENODEV; if (dev == NULL) goto out; err = -ENOBUFS; nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask)); if (nskb == NULL) goto out; /* Synchronize the carrier state so we don't report a state * that we're not actually going to honour immediately; if * the driver just did a carrier off->on transition, we can * only TX if link watch work has run, but without this we'd * already report carrier on, even if it doesn't work yet. */ linkwatch_sync_dev(dev); err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); out: if (netnsid >= 0) put_net(tgt_net); return err; } static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, bool *changed, struct netlink_ext_ack *extack) { char *alt_ifname; size_t size; int err; err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); if (err) return err; if (cmd == RTM_NEWLINKPROP) { size = rtnl_prop_list_size(dev); size += nla_total_size(ALTIFNAMSIZ); if (size >= U16_MAX) { NL_SET_ERR_MSG(extack, "effective property list too long"); return -EINVAL; } } alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (!alt_ifname) return -ENOMEM; if (cmd == RTM_NEWLINKPROP) { err = netdev_name_node_alt_create(dev, alt_ifname); if (!err) alt_ifname = NULL; } else if (cmd == RTM_DELLINKPROP) { err = netdev_name_node_alt_destroy(dev, alt_ifname); } else { WARN_ON_ONCE(1); err = -EINVAL; } kfree(alt_ifname); if (!err) *changed = true; return err; } static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; struct ifinfomsg *ifm; bool changed = false; struct nlattr *attr; int err, rem; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err) return err; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else return -EINVAL; if (!dev) return -ENODEV; if (!tb[IFLA_PROP_LIST]) return 0; nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) { switch (nla_type(attr)) { case IFLA_ALT_IFNAME: err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack); if (err) return err; break; } } if (changed) netdev_state_change(dev); return 0; } static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack); } static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); } static noinline_for_stack u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); size_t min_ifinfo_dump_size = 0; u32 ext_filter_mask = 0; struct net_device *dev; struct nlattr *nla; int hdrlen, rem; /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return NLMSG_GOODSIZE; nla_for_each_attr_type(nla, IFLA_EXT_MASK, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), rem) { if (nla_len(nla) == sizeof(u32)) ext_filter_mask = nla_get_u32(nla); } if (!ext_filter_mask) return NLMSG_GOODSIZE; /* * traverse the list of net devices and compute the minimum * buffer size based upon the filter mask. */ rcu_read_lock(); for_each_netdev_rcu(net, dev) { min_ifinfo_dump_size = max(min_ifinfo_dump_size, if_nlmsg_size(dev, ext_filter_mask)); } rcu_read_unlock(); return nlmsg_total_size(min_ifinfo_dump_size); } static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) { int idx; int s_idx = cb->family; int type = cb->nlh->nlmsg_type - RTM_BASE; int ret = 0; if (s_idx == 0) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { struct rtnl_link __rcu **tab; struct rtnl_link *link; rtnl_dumpit_func dumpit; if (idx < s_idx || idx == PF_PACKET) continue; if (type < 0 || type >= RTM_NR_MSGTYPES) continue; tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]); if (!tab) continue; link = rcu_dereference_rtnl(tab[type]); if (!link) continue; dumpit = link->dumpit; if (!dumpit) continue; if (idx > s_idx) { memset(&cb->args[0], 0, sizeof(cb->args)); cb->prev_seq = 0; cb->seq = 0; } ret = dumpit(skb, cb); if (ret) break; } cb->family = idx; return skb->len ? : ret; } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex, u32 portid, const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; u32 seq = 0; skb = nlmsg_new(if_nlmsg_size(dev, 0), flags); if (skb == NULL) goto errout; if (nlmsg_report(nlh)) seq = nlmsg_seq(nlh); else portid = 0; err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, portid, seq, change, 0, 0, event, new_nsid, new_ifindex, -1, flags); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } return skb; errout: rtnl_set_sk_err(net, RTNLGRP_LINK, err); return NULL; } void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags, u32 portid, const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); rtnl_notify(skb, net, portid, RTNLGRP_LINK, nlh, flags); } static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex, u32 portid, const struct nlmsghdr *nlh) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, new_ifindex, portid, nlh); if (skb) rtmsg_ifinfo_send(skb, dev, flags, portid, nlh); } void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags, u32 portid, const struct nlmsghdr *nlh) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL, 0, portid, nlh); } void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, new_nsid, new_ifindex, 0, NULL); } static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, int type, unsigned int flags, int nlflags, u16 ndm_state) { struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags); if (!nlh) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = flags; ndm->ndm_type = 0; ndm->ndm_ifindex = dev->ifindex; ndm->ndm_state = ndm_state; if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr)) goto nla_put_failure; if (vid) if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(dev->addr_len) + /* NDA_LLADDR */ nla_total_size(sizeof(u16)) + /* NDA_VLAN */ 0; } static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, u16 ndm_state) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_fdb_fill(skb, dev, addr, vid, 0, 0, type, NTF_SELF, 0, ndm_state); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } /* * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry */ int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags) { int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } if (tb[NDA_FLAGS_EXT]) { netdev_info(dev, "invalid flags given to default FDB implementation\n"); return err; } if (vid) { netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n"); return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_add_excl(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_add_excl(dev, addr); /* Only return duplicate errors if NLM_F_EXCL is set */ if (err == -EEXIST && !(flags & NLM_F_EXCL)) err = 0; return err; } EXPORT_SYMBOL(ndo_dflt_fdb_add); static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid, struct netlink_ext_ack *extack) { u16 vid = 0; if (vlan_attr) { if (nla_len(vlan_attr) != sizeof(u16)) { NL_SET_ERR_MSG(extack, "invalid vlan attribute size"); return -EINVAL; } vid = nla_get_u16(vlan_attr); if (!vid || vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "invalid vlan id"); return -EINVAL; } } *p_vid = vid; return 0; } static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; u8 *addr; u16 vid; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } if (dev->type != ARPHRD_ETHER) { NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; bool notified = false; err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, &notified, extack); if (err) goto out; else ndm->ndm_flags &= ~NTF_MASTER; } /* Embedded bridge, macvlan, and any other device support */ if ((ndm->ndm_flags & NTF_SELF)) { bool notified = false; if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, &notified, extack); else err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); if (!err && !notified) { rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } out: return err; } /* * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry */ int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid) { int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (!(ndm->ndm_state & NUD_PERMANENT)) { netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); return err; } EXPORT_SYMBOL(ndo_dflt_fdb_del); static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); struct net *net = sock_net(skb->sk); const struct net_device_ops *ops; struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; __u8 *addr = NULL; int err; u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!del_bulk) { err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); } else { /* For bulk delete, the drivers will parse the message with * policy. */ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); } if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!del_bulk) { if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; } if (dev->type != ARPHRD_ETHER) { NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices"); return -EINVAL; } err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); bool notified = false; ops = br_dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, &notified, extack); } else { if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(nlh, dev, extack); } if (err) goto out; else ndm->ndm_flags &= ~NTF_MASTER; } /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { bool notified = false; ops = dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, &notified, extack); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); } else { /* in case err was cleared by NTF_MASTER call */ err = -EOPNOTSUPP; if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(nlh, dev, extack); } if (!err) { if (!del_bulk && !notified) rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } out: return err; } static int nlmsg_populate_fdb(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, int *idx, struct netdev_hw_addr_list *list) { struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct netdev_hw_addr *ha; u32 portid, seq; int err; portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { if (*idx < ctx->fdb_idx) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, portid, seq, RTM_NEWNEIGH, NTF_SELF, NLM_F_MULTI, NUD_PERMANENT); if (err < 0) return err; skip: *idx += 1; } return 0; } /** * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table. * @skb: socket buffer to store message in * @cb: netlink callback * @dev: netdevice * @filter_dev: ignored * @idx: the number of FDB table entries dumped is added to *@idx * * Default netdevice operation to dump the existing unicast address list. * Returns number of addresses from list put in skb. */ int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { int err; if (dev->type != ARPHRD_ETHER) return -EINVAL; netif_addr_lock_bh(dev); err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) goto out; err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); out: netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, int *br_idx, int *brport_idx, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); return -EINVAL; } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, extack); if (err < 0) return err; *brport_idx = ndm->ndm_ifindex; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_IFINDEX: if (nla_len(tb[i]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request"); return -EINVAL; } *brport_idx = nla_get_u32(tb[NDA_IFINDEX]); break; case NDA_MASTER: if (nla_len(tb[i]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request"); return -EINVAL; } *br_idx = nla_get_u32(tb[NDA_MASTER]); break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request"); return -EINVAL; } } return 0; } static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, int *br_idx, int *brport_idx, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX+1]; int err; /* A hack to preserve kernel<->userspace interface. * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails. * So, check for ndmsg with an optional u32 attribute (not used here). * Fortunately these sizes don't conflict with the size of ifinfomsg * with an optional attribute. */ if (nlmsg_len(nlh) != sizeof(struct ndmsg) && (nlmsg_len(nlh) != sizeof(struct ndmsg) + nla_attr_size(sizeof(u32)))) { struct ifinfomsg *ifm; err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) { return -EINVAL; } else if (err == 0) { if (tb[IFLA_MASTER]) *br_idx = nla_get_u32(tb[IFLA_MASTER]); } ifm = nlmsg_data(nlh); *brport_idx = ifm->ifi_index; } return 0; } static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { const struct net_device_ops *ops = NULL, *cops = NULL; struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct net_device *dev, *br_dev = NULL; struct net *net = sock_net(skb->sk); int brport_idx = 0; int br_idx = 0; int fidx = 0; int err; NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context); if (cb->strict_check) err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, cb->extack); else err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx, cb->extack); if (err < 0) return err; if (br_idx) { br_dev = __dev_get_by_index(net, br_idx); if (!br_dev) return -ENODEV; ops = br_dev->netdev_ops; } for_each_netdev_dump(net, dev, ctx->ifindex) { if (brport_idx && (dev->ifindex != brport_idx)) continue; if (!br_idx) { /* user did not specify a specific bridge */ if (netif_is_bridge_port(dev)) { br_dev = netdev_master_upper_dev_get(dev); cops = br_dev->netdev_ops; } } else { if (dev != br_dev && !netif_is_bridge_port(dev)) continue; if (br_dev != netdev_master_upper_dev_get(dev) && !netif_is_bridge_master(dev)) continue; cops = ops; } if (netif_is_bridge_port(dev)) { if (cops && cops->ndo_fdb_dump) { err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, &fidx); if (err == -EMSGSIZE) break; } } if (dev->netdev_ops->ndo_fdb_dump) err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, &fidx); else err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx); if (err == -EMSGSIZE) break; cops = NULL; /* reset fdb offset to 0 for rest of the interfaces */ ctx->fdb_idx = 0; fidx = 0; } ctx->fdb_idx = fidx; return skb->len; } static int valid_fdb_get_strict(const struct nlmsghdr *nlh, struct nlattr **tb, u8 *ndm_flags, int *br_idx, int *brport_idx, u8 **addr, u16 *vid, struct netlink_ext_ack *extack) { struct ndmsg *ndm; int err, i; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); return -EINVAL; } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); return -EINVAL; } if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) { NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) return err; *ndm_flags = ndm->ndm_flags; *brport_idx = ndm->ndm_ifindex; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_MASTER: *br_idx = nla_get_u32(tb[i]); break; case NDA_LLADDR: if (nla_len(tb[i]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Invalid address in fdb get request"); return -EINVAL; } *addr = nla_data(tb[i]); break; case NDA_VLAN: err = fdb_vid_parse(tb[i], vid, extack); if (err) return err; break; case NDA_VNI: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request"); return -EINVAL; } } return 0; } static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net_device *dev = NULL, *br_dev = NULL; const struct net_device_ops *ops = NULL; struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NDA_MAX + 1]; struct sk_buff *skb; int brport_idx = 0; u8 ndm_flags = 0; int br_idx = 0; u8 *addr = NULL; u16 vid = 0; int err; err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx, &brport_idx, &addr, &vid, extack); if (err < 0) return err; if (!addr) { NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request"); return -EINVAL; } if (brport_idx) { dev = __dev_get_by_index(net, brport_idx); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); return -ENODEV; } } if (br_idx) { if (dev) { NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive"); return -EINVAL; } br_dev = __dev_get_by_index(net, br_idx); if (!br_dev) { NL_SET_ERR_MSG(extack, "Invalid master ifindex"); return -EINVAL; } ops = br_dev->netdev_ops; } if (dev) { if (!ndm_flags || (ndm_flags & NTF_MASTER)) { if (!netif_is_bridge_port(dev)) { NL_SET_ERR_MSG(extack, "Device is not a bridge port"); return -EINVAL; } br_dev = netdev_master_upper_dev_get(dev); if (!br_dev) { NL_SET_ERR_MSG(extack, "Master of device not found"); return -EINVAL; } ops = br_dev->netdev_ops; } else { if (!(ndm_flags & NTF_SELF)) { NL_SET_ERR_MSG(extack, "Missing NTF_SELF"); return -EINVAL; } ops = dev->netdev_ops; } } if (!br_dev && !dev) { NL_SET_ERR_MSG(extack, "No device specified"); return -ENODEV; } if (!ops || !ops->ndo_fdb_get) { NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device"); return -EOPNOTSUPP; } skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (br_dev) dev = br_dev; err = ops->ndo_fdb_get(skb, tb, dev, addr, vid, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, extack); if (err) goto out; return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: kfree_skb(skb); return err; } static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, unsigned int attrnum, unsigned int flag) { if (mask & flag) return nla_put_u8(skb, attrnum, !!(flags & flag)); return 0; } int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, u32 flags, u32 mask, int nlflags, u32 filter_mask, int (*vlan_fill)(struct sk_buff *skb, struct net_device *dev, u32 filter_mask)) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; struct nlattr *br_afspec; struct nlattr *protinfo; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); int err = 0; nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifi_family = AF_BRIDGE; ifm->__ifi_pad = 0; ifm->ifi_type = dev->type; ifm->ifi_index = dev->ifindex; ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = 0; if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u8(skb, IFLA_OPERSTATE, operstate) || (br_dev && nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || (dev->ifindex != dev_get_iflink(dev) && nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; br_afspec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!br_afspec) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } if (mode != BRIDGE_MODE_UNDEF) { if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } } if (vlan_fill) { err = vlan_fill(skb, dev, filter_mask); if (err) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } } nla_nest_end(skb, br_afspec); protinfo = nla_nest_start(skb, IFLA_PROTINFO); if (!protinfo) goto nla_put_failure; if (brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_GUARD, BR_BPDU_GUARD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_LEARNING, BR_LEARNING) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_PROXYARP, BR_PROXYARP) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) { nla_nest_cancel(skb, protinfo); goto nla_put_failure; } nla_nest_end(skb, protinfo); nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return err ? err : -EMSGSIZE; } EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, bool strict_check, u32 *filter_mask, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX+1]; int err, i; if (strict_check) { struct ifinfomsg *ifm; ifm = nlmsg_payload(nlh, sizeof(*ifm)); if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); return -EINVAL; } if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); } else { err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); } if (err < 0) return err; /* new attributes should only be added with strict checking */ for (i = 0; i <= IFLA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case IFLA_EXT_MASK: *filter_mask = nla_get_u32(tb[i]); break; default: if (strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request"); return -EINVAL; } } } return 0; } static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = nlh->nlmsg_seq; u32 filter_mask = 0; int err; err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask, cb->extack); if (err < 0 && cb->strict_check) return err; rcu_read_lock(); for_each_netdev_rcu(net, dev) { const struct net_device_ops *ops = dev->netdev_ops; struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { if (idx >= cb->args[0]) { err = br_dev->netdev_ops->ndo_bridge_getlink( skb, portid, seq, dev, filter_mask, NLM_F_MULTI); if (err < 0 && err != -EOPNOTSUPP) { if (likely(skb->len)) break; goto out_err; } } idx++; } if (ops->ndo_bridge_getlink) { if (idx >= cb->args[0]) { err = ops->ndo_bridge_getlink(skb, portid, seq, dev, filter_mask, NLM_F_MULTI); if (err < 0 && err != -EOPNOTSUPP) { if (likely(skb->len)) break; goto out_err; } } idx++; } } err = skb->len; out_err: rcu_read_unlock(); cb->args[0] = idx; return err; } static inline size_t bridge_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(sizeof(u32)) /* IFLA_MASTER */ + nla_total_size(sizeof(u32)) /* IFLA_MTU */ + nla_total_size(sizeof(u32)) /* IFLA_LINK */ + nla_total_size(sizeof(u32)) /* IFLA_OPERSTATE */ + nla_total_size(sizeof(u8)) /* IFLA_PROTINFO */ + nla_total_size(sizeof(struct nlattr)) /* IFLA_AF_SPEC */ + nla_total_size(sizeof(u16)) /* IFLA_BRIDGE_FLAGS */ + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */ } static int rtnl_bridge_notify(struct net_device *dev) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -EOPNOTSUPP; if (!dev->netdev_ops->ndo_bridge_getlink) return 0; skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC); if (!skb) { err = -ENOMEM; goto errout; } err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0); if (err < 0) goto errout; /* Notification info is only filled for bridge ports, not the bridge * device itself. Therefore, a zero notification length is valid and * should not result in an error. */ if (!skb->len) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return 0; errout: WARN_ON(err == -EMSGSIZE); kfree_skb(skb); if (err) rtnl_set_sk_err(net, RTNLGRP_LINK, err); return err; } static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; struct nlattr *br_spec, *attr, *br_flags_attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_family != AF_BRIDGE) return -EPFNOSUPPORT; dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; br_flags_attr = attr; flags = nla_get_u16(attr); } if (nla_type(attr) == IFLA_BRIDGE_MODE) { if (nla_len(attr) < sizeof(u16)) return -EINVAL; } } } if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) { err = -EOPNOTSUPP; goto out; } err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, extack); if (err) goto out; flags &= ~BRIDGE_FLAGS_MASTER; } if ((flags & BRIDGE_FLAGS_SELF)) { if (!dev->netdev_ops->ndo_bridge_setlink) err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, extack); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; /* Generate event to notify upper layer of bridge * change */ err = rtnl_bridge_notify(dev); } } if (br_flags_attr) memcpy(nla_data(br_flags_attr), &flags, sizeof(flags)); out: return err; } static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; struct nlattr *br_spec, *attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; bool have_flags = false; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_family != AF_BRIDGE) return -EPFNOSUPPORT; dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec, rem) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; have_flags = true; flags = nla_get_u16(attr); break; } } if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) { err = -EOPNOTSUPP; goto out; } err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (err) goto out; flags &= ~BRIDGE_FLAGS_MASTER; } if ((flags & BRIDGE_FLAGS_SELF)) { if (!dev->netdev_ops->ndo_bridge_dellink) err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; /* Generate event to notify upper layer of bridge * change */ err = rtnl_bridge_notify(dev); } } if (have_flags) memcpy(nla_data(attr), &flags, sizeof(flags)); out: return err; } static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) { return (mask & IFLA_STATS_FILTER_BIT(attrid)) && (!idxattr || idxattr == attrid); } static bool rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id) { return dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && dev->netdev_ops->ndo_get_offload_stats && dev->netdev_ops->ndo_has_offload_stats(dev, attr_id); } static unsigned int rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id) { return rtnl_offload_xstats_have_ndo(dev, attr_id) ? sizeof(struct rtnl_link_stats64) : 0; } static int rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id, struct sk_buff *skb) { unsigned int size = rtnl_offload_xstats_get_size_ndo(dev, attr_id); struct nlattr *attr = NULL; void *attr_data; int err; if (!size) return -ENODATA; attr = nla_reserve_64bit(skb, attr_id, size, IFLA_OFFLOAD_XSTATS_UNSPEC); if (!attr) return -EMSGSIZE; attr_data = nla_data(attr); memset(attr_data, 0, size); err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data); if (err) return err; return 0; } static unsigned int rtnl_offload_xstats_get_size_stats(const struct net_device *dev, enum netdev_offload_xstats_type type) { bool enabled = netdev_offload_xstats_enabled(dev, type); return enabled ? sizeof(struct rtnl_hw_stats64) : 0; } struct rtnl_offload_xstats_request_used { bool request; bool used; }; static int rtnl_offload_xstats_get_stats(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_offload_xstats_request_used *ru, struct rtnl_hw_stats64 *stats, struct netlink_ext_ack *extack) { bool request; bool used; int err; request = netdev_offload_xstats_enabled(dev, type); if (!request) { used = false; goto out; } err = netdev_offload_xstats_get(dev, type, stats, &used, extack); if (err) return err; out: if (ru) { ru->request = request; ru->used = used; } return 0; } static int rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id, struct rtnl_offload_xstats_request_used *ru) { struct nlattr *nest; nest = nla_nest_start(skb, attr_id); if (!nest) return -EMSGSIZE; if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, ru->request)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev, struct netlink_ext_ack *extack) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; struct rtnl_offload_xstats_request_used ru_l3; struct nlattr *nest; int err; err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack); if (err) return err; nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO); if (!nest) return -EMSGSIZE; if (rtnl_offload_xstats_fill_hw_s_info_one(skb, IFLA_OFFLOAD_XSTATS_L3_STATS, &ru_l3)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev, int *prividx, u32 off_filter_mask, struct netlink_ext_ack *extack) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO; int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS; int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; bool have_data = false; int err; if (*prividx <= attr_id_cpu_hit && (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) { err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb); if (!err) { have_data = true; } else if (err != -ENODATA) { *prividx = attr_id_cpu_hit; return err; } } if (*prividx <= attr_id_hw_s_info && (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) { *prividx = attr_id_hw_s_info; err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack); if (err) return err; have_data = true; *prividx = 0; } if (*prividx <= attr_id_l3_stats && (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) { unsigned int size_l3; struct nlattr *attr; *prividx = attr_id_l3_stats; size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3); if (!size_l3) goto skip_l3_stats; attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3, IFLA_OFFLOAD_XSTATS_UNSPEC); if (!attr) return -EMSGSIZE; err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL, nla_data(attr), extack); if (err) return err; have_data = true; skip_l3_stats: *prividx = 0; } if (!have_data) return -ENODATA; *prividx = 0; return 0; } static unsigned int rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev, enum netdev_offload_xstats_type type) { return nla_total_size(0) + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */ nla_total_size(sizeof(u8)) + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */ nla_total_size(sizeof(u8)) + 0; } static unsigned int rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; return nla_total_size(0) + /* IFLA_OFFLOAD_XSTATS_L3_STATS */ rtnl_offload_xstats_get_size_hw_s_info_one(dev, t_l3) + 0; } static int rtnl_offload_xstats_get_size(const struct net_device *dev, u32 off_filter_mask) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; int nla_size = 0; int size; if (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_cpu_hit)) { size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit); nla_size += nla_total_size_64bit(size); } if (off_filter_mask & IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO)) nla_size += rtnl_offload_xstats_get_size_hw_s_info(dev); if (off_filter_mask & IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) { size = rtnl_offload_xstats_get_size_stats(dev, t_l3); nla_size += nla_total_size_64bit(size); } if (nla_size != 0) nla_size += nla_total_size(0); return nla_size; } struct rtnl_stats_dump_filters { /* mask[0] filters outer attributes. Then individual nests have their * filtering mask at the index of the nested attribute. */ u32 mask[IFLA_STATS_MAX + 1]; }; static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, const struct rtnl_stats_dump_filters *filters, int *idxattr, int *prividx, struct netlink_ext_ack *extack) { unsigned int filter_mask = filters->mask[0]; struct if_stats_msg *ifsm; struct nlmsghdr *nlh; struct nlattr *attr; int s_prividx = *prividx; int err; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags); if (!nlh) return -EMSGSIZE; ifsm = nlmsg_data(nlh); ifsm->family = PF_UNSPEC; ifsm->pad1 = 0; ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) { struct rtnl_link_stats64 *sp; attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64, sizeof(struct rtnl_link_stats64), IFLA_STATS_UNSPEC); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } sp = nla_data(attr); dev_get_stats(dev, sp); } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; *idxattr = 0; } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, *idxattr)) { const struct rtnl_link_ops *ops = NULL; const struct net_device *master; master = netdev_master_upper_dev_get(dev); if (master) ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS_SLAVE); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; *idxattr = 0; } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, *idxattr)) { u32 off_filter_mask; off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } err = rtnl_offload_xstats_fill(skb, dev, prividx, off_filter_mask, extack); if (err == -ENODATA) nla_nest_cancel(skb, attr); else nla_nest_end(skb, attr); if (err && err != -ENODATA) goto nla_put_failure; *idxattr = 0; } if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) { struct rtnl_af_ops *af_ops; *idxattr = IFLA_STATS_AF_SPEC; attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->fill_stats_af) { struct nlattr *af; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) { rcu_read_unlock(); err = -EMSGSIZE; goto nla_put_failure; } err = af_ops->fill_stats_af(skb, dev); if (err == -ENODATA) { nla_nest_cancel(skb, af); } else if (err < 0) { rcu_read_unlock(); goto nla_put_failure; } nla_nest_end(skb, af); } } rcu_read_unlock(); nla_nest_end(skb, attr); *idxattr = 0; } nlmsg_end(skb, nlh); return 0; nla_put_failure: /* not a multi message or no progress mean a real error */ if (!(flags & NLM_F_MULTI) || s_prividx == *prividx) nlmsg_cancel(skb, nlh); else nlmsg_end(skb, nlh); return err; } static size_t if_nlmsg_stats_size(const struct net_device *dev, const struct rtnl_stats_dump_filters *filters) { size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); unsigned int filter_mask = filters->mask[0]; if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; int attr = IFLA_STATS_LINK_XSTATS; if (ops && ops->get_linkxstats_size) { size += nla_total_size(ops->get_linkxstats_size(dev, attr)); /* for IFLA_STATS_LINK_XSTATS */ size += nla_total_size(0); } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) { struct net_device *_dev = (struct net_device *)dev; const struct rtnl_link_ops *ops = NULL; const struct net_device *master; /* netdev_master_upper_dev_get can't take const */ master = netdev_master_upper_dev_get(_dev); if (master) ops = master->rtnl_link_ops; if (ops && ops->get_linkxstats_size) { int attr = IFLA_STATS_LINK_XSTATS_SLAVE; size += nla_total_size(ops->get_linkxstats_size(dev, attr)); /* for IFLA_STATS_LINK_XSTATS_SLAVE */ size += nla_total_size(0); } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) { u32 off_filter_mask; off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; size += rtnl_offload_xstats_get_size(dev, off_filter_mask); } if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { struct rtnl_af_ops *af_ops; /* for IFLA_STATS_AF_SPEC */ size += nla_total_size(0); rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_stats_af_size) { size += nla_total_size( af_ops->get_stats_af_size(dev)); /* for AF_* */ size += nla_total_size(0); } } rcu_read_unlock(); } return size; } #define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1) static const struct nla_policy rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = { [IFLA_STATS_LINK_OFFLOAD_XSTATS] = NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID), }; static const struct nla_policy rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = { [IFLA_STATS_GET_FILTERS] = NLA_POLICY_NESTED(rtnl_stats_get_policy_filters), }; static const struct nla_policy ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = { [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1), }; static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters, struct rtnl_stats_dump_filters *filters, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_STATS_MAX + 1]; int err; int at; err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters, rtnl_stats_get_policy_filters, extack); if (err < 0) return err; for (at = 1; at <= IFLA_STATS_MAX; at++) { if (tb[at]) { if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(at))) { NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask"); return -EINVAL; } filters->mask[at] = nla_get_u32(tb[at]); } } return 0; } static int rtnl_stats_get_parse(const struct nlmsghdr *nlh, u32 filter_mask, struct rtnl_stats_dump_filters *filters, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; int err; int i; filters->mask[0] = filter_mask; for (i = 1; i < ARRAY_SIZE(filters->mask); i++) filters->mask[i] = -1U; err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb, IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack); if (err < 0) return err; if (tb[IFLA_STATS_GET_FILTERS]) { err = rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS], filters, extack); if (err) return err; } return 0; } static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, bool is_dump, struct netlink_ext_ack *extack) { struct if_stats_msg *ifsm; ifsm = nlmsg_payload(nlh, sizeof(*ifsm)); if (!ifsm) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } if (!strict_check) return 0; /* only requests using strict checks can pass data to influence * the dump. The legacy exception is filter_mask. */ if (ifsm->pad1 || ifsm->pad2 || (is_dump && ifsm->ifindex)) { NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); return -EINVAL; } if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) { NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask"); return -EINVAL; } return 0; } static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; int idxattr = 0, prividx = 0; struct if_stats_msg *ifsm; struct sk_buff *nskb; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), false, extack); if (err) return err; ifsm = nlmsg_data(nlh); if (ifsm->ifindex > 0) dev = __dev_get_by_index(net, ifsm->ifindex); else return -EINVAL; if (!dev) return -ENODEV; if (!ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get"); return -EINVAL; } err = rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack); if (err) return err; nskb = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL); if (!nskb) return -ENOBUFS; err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, &filters, &idxattr, &prividx, extack); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else { err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); } return err; } static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; struct { unsigned long ifindex; int idxattr; int prividx; } *ctx = (void *)cb->ctx; struct net_device *dev; int err; cb->seq = net->dev_base_seq; err = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true, extack); if (err) return err; ifsm = nlmsg_data(cb->nlh); if (!ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); return -EINVAL; } err = rtnl_stats_get_parse(cb->nlh, ifsm->filter_mask, &filters, extack); if (err) return err; for_each_netdev_dump(net, dev, ctx->ifindex) { err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, &filters, &ctx->idxattr, &ctx->prividx, extack); /* If we ran out of room on the first message, * we're in trouble. */ WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); if (err < 0) break; ctx->prividx = 0; ctx->idxattr = 0; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); } return err; } void rtnl_offload_xstats_notify(struct net_device *dev) { struct rtnl_stats_dump_filters response_filters = {}; struct net *net = dev_net(dev); int idxattr = 0, prividx = 0; struct sk_buff *skb; int err = -ENOBUFS; ASSERT_RTNL(); response_filters.mask[0] |= IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters), GFP_KERNEL); if (!skb) goto errout; err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0, &response_filters, &idxattr, &prividx, NULL); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_STATS, err); } EXPORT_SYMBOL(rtnl_offload_xstats_notify); static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; struct rtnl_stats_dump_filters response_filters = {}; struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; struct if_stats_msg *ifsm; bool notify = false; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), false, extack); if (err) return err; ifsm = nlmsg_data(nlh); if (ifsm->family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC"); return -EINVAL; } if (ifsm->ifindex > 0) dev = __dev_get_by_index(net, ifsm->ifindex); else return -EINVAL; if (!dev) return -ENODEV; if (ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set"); return -EINVAL; } err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX, ifla_stats_set_policy, extack); if (err < 0) return err; if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) { u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]); if (req) err = netdev_offload_xstats_enable(dev, t_l3, extack); else err = netdev_offload_xstats_disable(dev, t_l3); if (!err) notify = true; else if (err != -EALREADY) return err; response_filters.mask[0] |= IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); } if (notify) rtnl_offload_xstats_notify(dev); return 0; } static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct br_port_msg *bpm; bpm = nlmsg_payload(nlh, sizeof(*bpm)); if (!bpm) { NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request"); return -EINVAL; } if (bpm->ifindex) { NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*bpm))) { NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request"); return -EINVAL; } return 0; } struct rtnl_mdb_dump_ctx { long idx; }; static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct rtnl_mdb_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx, s_idx; int err; NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx); if (cb->strict_check) { err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack); if (err) return err; } s_idx = ctx->idx; idx = 0; for_each_netdev(net, dev) { if (idx < s_idx) goto skip; if (!dev->netdev_ops->ndo_mdb_dump) goto skip; err = dev->netdev_ops->ndo_mdb_dump(dev, skb, cb); if (err == -EMSGSIZE) goto out; /* Moving on to next device, reset markers and sequence * counters since they are all maintained per-device. */ memset(cb->ctx, 0, sizeof(cb->ctx)); cb->prev_seq = 0; cb->seq = 0; skip: idx++; } out: ctx->idx = idx; return skb->len; } static int rtnl_validate_mdb_entry_get(const struct nlattr *attr, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(attr); if (nla_len(attr) != sizeof(struct br_mdb_entry)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); return -EINVAL; } if (entry->ifindex) { NL_SET_ERR_MSG(extack, "Entry ifindex cannot be specified"); return -EINVAL; } if (entry->state) { NL_SET_ERR_MSG(extack, "Entry state cannot be specified"); return -EINVAL; } if (entry->flags) { NL_SET_ERR_MSG(extack, "Entry flags cannot be specified"); return -EINVAL; } if (entry->vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); return -EINVAL; } if (entry->addr.proto != htons(ETH_P_IP) && entry->addr.proto != htons(ETH_P_IPV6) && entry->addr.proto != 0) { NL_SET_ERR_MSG(extack, "Unknown entry protocol"); return -EINVAL; } return 0; } static const struct nla_policy mdba_get_policy[MDBA_GET_ENTRY_MAX + 1] = { [MDBA_GET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, rtnl_validate_mdb_entry_get, sizeof(struct br_mdb_entry)), [MDBA_GET_ENTRY_ATTRS] = { .type = NLA_NESTED }, }; static int rtnl_mdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[MDBA_GET_ENTRY_MAX + 1]; struct net *net = sock_net(in_skb->sk); struct br_port_msg *bpm; struct net_device *dev; int err; err = nlmsg_parse(nlh, sizeof(struct br_port_msg), tb, MDBA_GET_ENTRY_MAX, mdba_get_policy, extack); if (err) return err; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, bpm->ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Device doesn't exist"); return -ENODEV; } if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_GET_ENTRY)) { NL_SET_ERR_MSG(extack, "Missing MDBA_GET_ENTRY attribute"); return -EINVAL; } if (!dev->netdev_ops->ndo_mdb_get) { NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); return -EOPNOTSUPP; } return dev->netdev_ops->ndo_mdb_get(dev, tb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, extack); } static int rtnl_validate_mdb_entry(const struct nlattr *attr, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(attr); if (nla_len(attr) != sizeof(struct br_mdb_entry)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); return -EINVAL; } if (entry->ifindex == 0) { NL_SET_ERR_MSG(extack, "Zero entry ifindex is not allowed"); return -EINVAL; } if (entry->addr.proto == htons(ETH_P_IP)) { if (!ipv4_is_multicast(entry->addr.u.ip4) && !ipv4_is_zeronet(entry->addr.u.ip4)) { NL_SET_ERR_MSG(extack, "IPv4 entry group address is not multicast or 0.0.0.0"); return -EINVAL; } if (ipv4_is_local_multicast(entry->addr.u.ip4)) { NL_SET_ERR_MSG(extack, "IPv4 entry group address is local multicast"); return -EINVAL; } #if IS_ENABLED(CONFIG_IPV6) } else if (entry->addr.proto == htons(ETH_P_IPV6)) { if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) { NL_SET_ERR_MSG(extack, "IPv6 entry group address is link-local all nodes"); return -EINVAL; } #endif } else if (entry->addr.proto == 0) { /* L2 mdb */ if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) { NL_SET_ERR_MSG(extack, "L2 entry group is not multicast"); return -EINVAL; } } else { NL_SET_ERR_MSG(extack, "Unknown entry protocol"); return -EINVAL; } if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { NL_SET_ERR_MSG(extack, "Unknown entry state"); return -EINVAL; } if (entry->vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); return -EINVAL; } return 0; } static const struct nla_policy mdba_policy[MDBA_SET_ENTRY_MAX + 1] = { [MDBA_SET_ENTRY_UNSPEC] = { .strict_start_type = MDBA_SET_ENTRY_ATTRS + 1 }, [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, rtnl_validate_mdb_entry, sizeof(struct br_mdb_entry)), [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, }; static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; struct net *net = sock_net(skb->sk); struct br_port_msg *bpm; struct net_device *dev; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, mdba_policy, extack); if (err) return err; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, bpm->ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Device doesn't exist"); return -ENODEV; } if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) { NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute"); return -EINVAL; } if (!dev->netdev_ops->ndo_mdb_add) { NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); return -EOPNOTSUPP; } return dev->netdev_ops->ndo_mdb_add(dev, tb, nlh->nlmsg_flags, extack); } static int rtnl_validate_mdb_entry_del_bulk(const struct nlattr *attr, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(attr); struct br_mdb_entry zero_entry = {}; if (nla_len(attr) != sizeof(struct br_mdb_entry)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); return -EINVAL; } if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { NL_SET_ERR_MSG(extack, "Unknown entry state"); return -EINVAL; } if (entry->flags) { NL_SET_ERR_MSG(extack, "Entry flags cannot be set"); return -EINVAL; } if (entry->vid >= VLAN_N_VID - 1) { NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); return -EINVAL; } if (memcmp(&entry->addr, &zero_entry.addr, sizeof(entry->addr))) { NL_SET_ERR_MSG(extack, "Entry address cannot be set"); return -EINVAL; } return 0; } static const struct nla_policy mdba_del_bulk_policy[MDBA_SET_ENTRY_MAX + 1] = { [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, rtnl_validate_mdb_entry_del_bulk, sizeof(struct br_mdb_entry)), [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, }; static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; struct net *net = sock_net(skb->sk); struct br_port_msg *bpm; struct net_device *dev; int err; if (!del_bulk) err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, mdba_policy, extack); else err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, mdba_del_bulk_policy, extack); if (err) return err; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, bpm->ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Device doesn't exist"); return -ENODEV; } if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) { NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute"); return -EINVAL; } if (del_bulk) { if (!dev->netdev_ops->ndo_mdb_del_bulk) { NL_SET_ERR_MSG(extack, "Device does not support MDB bulk deletion"); return -EOPNOTSUPP; } return dev->netdev_ops->ndo_mdb_del_bulk(dev, tb, extack); } if (!dev->netdev_ops->ndo_mdb_del) { NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); return -EOPNOTSUPP; } return dev->netdev_ops->ndo_mdb_del(dev, tb, extack); } /* Process one rtnetlink message. */ static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const bool needs_lock = !(cb->flags & RTNL_FLAG_DUMP_UNLOCKED); rtnl_dumpit_func dumpit = cb->data; int err; /* Previous iteration have already finished, avoid calling->dumpit() * again, it may not expect to be called after it reached the end. */ if (!dumpit) return 0; if (needs_lock) rtnl_lock(); err = dumpit(skb, cb); if (needs_lock) rtnl_unlock(); /* Old dump handlers used to send NLM_DONE as in a separate recvmsg(). * Some applications which parse netlink manually depend on this. */ if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) { if (err < 0 && err != -EMSGSIZE) return err; if (!err) cb->data = NULL; return skb->len; } return err; } static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE || !(control->flags & RTNL_FLAG_DUMP_UNLOCKED)) { WARN_ON(control->data); control->data = control->dump; control->dump = rtnl_dumpit; } return netlink_dump_start(ssk, skb, nlh, control); } static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct rtnl_link *link; enum rtnl_kinds kind; struct module *owner; int err = -EOPNOTSUPP; rtnl_doit_func doit; unsigned int flags; int family; int type; type = nlh->nlmsg_type; if (type > RTM_MAX) return -EOPNOTSUPP; type -= RTM_BASE; /* All the messages must have at least 1 byte length */ if (nlmsg_len(nlh) < sizeof(struct rtgenmsg)) return 0; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; kind = rtnl_msgtype_kind(type); if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; rcu_read_lock(); if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) { struct sock *rtnl; rtnl_dumpit_func dumpit; u32 min_dump_alloc = 0; link = rtnl_get_link(family, type); if (!link || !link->dumpit) { family = PF_UNSPEC; link = rtnl_get_link(family, type); if (!link || !link->dumpit) goto err_unlock; } owner = link->owner; dumpit = link->dumpit; flags = link->flags; if (type == RTM_GETLINK - RTM_BASE) min_dump_alloc = rtnl_calcit(skb, nlh); err = 0; /* need to do this before rcu_read_unlock() */ if (!try_module_get(owner)) err = -EPROTONOSUPPORT; rcu_read_unlock(); rtnl = net->rtnl; if (err == 0) { struct netlink_dump_control c = { .dump = dumpit, .min_dump_alloc = min_dump_alloc, .module = owner, .flags = flags, }; err = rtnetlink_dump_start(rtnl, skb, nlh, &c); /* netlink_dump_start() will keep a reference on * module if dump is still in progress. */ module_put(owner); } return err; } link = rtnl_get_link(family, type); if (!link || !link->doit) { family = PF_UNSPEC; link = rtnl_get_link(PF_UNSPEC, type); if (!link || !link->doit) goto out_unlock; } owner = link->owner; if (!try_module_get(owner)) { err = -EPROTONOSUPPORT; goto out_unlock; } flags = link->flags; if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) && !(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) { NL_SET_ERR_MSG(extack, "Bulk delete is not supported"); module_put(owner); goto err_unlock; } if (flags & RTNL_FLAG_DOIT_UNLOCKED) { doit = link->doit; rcu_read_unlock(); if (doit) err = doit(skb, nlh, extack); module_put(owner); return err; } rcu_read_unlock(); rtnl_lock(); link = rtnl_get_link(family, type); if (link && link->doit) err = link->doit(skb, nlh, extack); rtnl_unlock(); module_put(owner); return err; out_unlock: rcu_read_unlock(); return err; err_unlock: rcu_read_unlock(); return -EOPNOTSUPP; } static void rtnetlink_rcv(struct sk_buff *skb) { netlink_rcv_skb(skb, &rtnetlink_rcv_msg); } static int rtnetlink_bind(struct net *net, int group) { switch (group) { case RTNLGRP_IPV4_MROUTE_R: case RTNLGRP_IPV6_MROUTE_R: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; break; } return 0; } static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REBOOT: case NETDEV_CHANGEMTU: case NETDEV_CHANGEADDR: case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: case NETDEV_BONDING_FAILOVER: case NETDEV_POST_TYPE_CHANGE: case NETDEV_NOTIFY_PEERS: case NETDEV_CHANGEUPPER: case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: case NETDEV_CHANGELOWERSTATE: case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), GFP_KERNEL, NULL, 0, 0, NULL); break; default: break; } return NOTIFY_DONE; } static struct notifier_block rtnetlink_dev_notifier = { .notifier_call = rtnetlink_event, }; static int __net_init rtnetlink_net_init(struct net *net) { struct sock *sk; struct netlink_kernel_cfg cfg = { .groups = RTNLGRP_MAX, .input = rtnetlink_rcv, .flags = NL_CFG_F_NONROOT_RECV, .bind = rtnetlink_bind, }; sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg); if (!sk) return -ENOMEM; net->rtnl = sk; return 0; } static void __net_exit rtnetlink_net_exit(struct net *net) { netlink_kernel_release(net->rtnl); net->rtnl = NULL; } static struct pernet_operations rtnetlink_net_ops = { .init = rtnetlink_net_init, .exit = rtnetlink_net_exit, }; static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink, .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_DELLINK, .doit = rtnl_dellink, .flags = RTNL_FLAG_DOIT_PERNET_WIP}, {.msgtype = RTM_GETLINK, .doit = rtnl_getlink, .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, {.msgtype = RTM_SETLINK, .doit = rtnl_setlink, .flags = RTNL_FLAG_DOIT_PERNET_WIP}, {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all}, {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all}, {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all}, {.msgtype = RTM_GETSTATS, .doit = rtnl_stats_get, .dumpit = rtnl_stats_dump}, {.msgtype = RTM_SETSTATS, .doit = rtnl_stats_set}, {.msgtype = RTM_NEWLINKPROP, .doit = rtnl_newlinkprop}, {.msgtype = RTM_DELLINKPROP, .doit = rtnl_dellinkprop}, {.protocol = PF_BRIDGE, .msgtype = RTM_GETLINK, .dumpit = rtnl_bridge_getlink}, {.protocol = PF_BRIDGE, .msgtype = RTM_DELLINK, .doit = rtnl_bridge_dellink}, {.protocol = PF_BRIDGE, .msgtype = RTM_SETLINK, .doit = rtnl_bridge_setlink}, {.protocol = PF_BRIDGE, .msgtype = RTM_NEWNEIGH, .doit = rtnl_fdb_add}, {.protocol = PF_BRIDGE, .msgtype = RTM_DELNEIGH, .doit = rtnl_fdb_del, .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, {.protocol = PF_BRIDGE, .msgtype = RTM_GETNEIGH, .doit = rtnl_fdb_get, .dumpit = rtnl_fdb_dump}, {.protocol = PF_BRIDGE, .msgtype = RTM_NEWMDB, .doit = rtnl_mdb_add}, {.protocol = PF_BRIDGE, .msgtype = RTM_DELMDB, .doit = rtnl_mdb_del, .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, {.protocol = PF_BRIDGE, .msgtype = RTM_GETMDB, .doit = rtnl_mdb_get, .dumpit = rtnl_mdb_dump}, }; void __init rtnetlink_init(void) { if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); register_netdevice_notifier(&rtnetlink_dev_notifier); rtnl_register_many(rtnetlink_rtnl_msg_handlers); }
1 1 6 2 1 1 2 2 1 1 1 1 2 2 2 2 5 1 4 2 2 1 2 2 15 15 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 // SPDX-License-Identifier: GPL-2.0-or-later /* * NetLabel CALIPSO/IPv6 Support * * This file defines the CALIPSO/IPv6 functions for the NetLabel system. The * NetLabel system manages static and dynamic label mappings for network * protocols such as CIPSO and CALIPSO. * * Authors: Paul Moore <paul@paul-moore.com> * Huw Davies <huw@codeweavers.com> */ /* (c) Copyright Hewlett-Packard Development Company, L.P., 2006 * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015 */ #include <linux/types.h> #include <linux/socket.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/netlabel.h> #include <net/calipso.h> #include <linux/atomic.h> #include "netlabel_user.h" #include "netlabel_calipso.h" #include "netlabel_mgmt.h" #include "netlabel_domainhash.h" /* Argument struct for calipso_doi_walk() */ struct netlbl_calipso_doiwalk_arg { struct netlink_callback *nl_cb; struct sk_buff *skb; u32 seq; }; /* Argument struct for netlbl_domhsh_walk() */ struct netlbl_domhsh_walk_arg { struct netlbl_audit *audit_info; u32 doi; }; /* NetLabel Generic NETLINK CALIPSO family */ static struct genl_family netlbl_calipso_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = { [NLBL_CALIPSO_A_DOI] = { .type = NLA_U32 }, [NLBL_CALIPSO_A_MTYPE] = { .type = NLA_U32 }, }; static const struct netlbl_calipso_ops *calipso_ops; /** * netlbl_calipso_ops_register - Register the CALIPSO operations * @ops: ops to register * * Description: * Register the CALIPSO packet engine operations. * */ const struct netlbl_calipso_ops * netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops) { return xchg(&calipso_ops, ops); } EXPORT_SYMBOL(netlbl_calipso_ops_register); static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void) { return READ_ONCE(calipso_ops); } /* NetLabel Command Handlers */ /** * netlbl_calipso_add_pass - Adds a CALIPSO pass DOI definition * @info: the Generic NETLINK info block * @audit_info: NetLabel audit information * * Description: * Create a new CALIPSO_MAP_PASS DOI definition based on the given ADD message * and add it to the CALIPSO engine. Return zero on success and non-zero on * error. * */ static int netlbl_calipso_add_pass(struct genl_info *info, struct netlbl_audit *audit_info) { int ret_val; struct calipso_doi *doi_def = NULL; doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); if (!doi_def) return -ENOMEM; doi_def->type = CALIPSO_MAP_PASS; doi_def->doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); ret_val = calipso_doi_add(doi_def, audit_info); if (ret_val != 0) calipso_doi_free(doi_def); return ret_val; } /** * netlbl_calipso_add - Handle an ADD message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Create a new DOI definition based on the given ADD message and add it to the * CALIPSO engine. Returns zero on success, negative values on failure. * */ static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info) { int ret_val = -EINVAL; struct netlbl_audit audit_info; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (!info->attrs[NLBL_CALIPSO_A_DOI] || !info->attrs[NLBL_CALIPSO_A_MTYPE]) return -EINVAL; if (!ops) return -EOPNOTSUPP; netlbl_netlink_auditinfo(&audit_info); switch (nla_get_u32(info->attrs[NLBL_CALIPSO_A_MTYPE])) { case CALIPSO_MAP_PASS: ret_val = netlbl_calipso_add_pass(info, &audit_info); break; } if (ret_val == 0) atomic_inc(&netlabel_mgmt_protocount); return ret_val; } /** * netlbl_calipso_list - Handle a LIST message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated LIST message and respond accordingly. * Returns zero on success and negative values on error. * */ static int netlbl_calipso_list(struct sk_buff *skb, struct genl_info *info) { int ret_val; struct sk_buff *ans_skb = NULL; void *data; u32 doi; struct calipso_doi *doi_def; if (!info->attrs[NLBL_CALIPSO_A_DOI]) { ret_val = -EINVAL; goto list_failure; } doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); doi_def = calipso_doi_getdef(doi); if (!doi_def) { ret_val = -EINVAL; goto list_failure; } ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ans_skb) { ret_val = -ENOMEM; goto list_failure_put; } data = genlmsg_put_reply(ans_skb, info, &netlbl_calipso_gnl_family, 0, NLBL_CALIPSO_C_LIST); if (!data) { ret_val = -ENOMEM; goto list_failure_put; } ret_val = nla_put_u32(ans_skb, NLBL_CALIPSO_A_MTYPE, doi_def->type); if (ret_val != 0) goto list_failure_put; calipso_doi_putdef(doi_def); genlmsg_end(ans_skb, data); return genlmsg_reply(ans_skb, info); list_failure_put: calipso_doi_putdef(doi_def); list_failure: kfree_skb(ans_skb); return ret_val; } /** * netlbl_calipso_listall_cb - calipso_doi_walk() callback for LISTALL * @doi_def: the CALIPSO DOI definition * @arg: the netlbl_calipso_doiwalk_arg structure * * Description: * This function is designed to be used as a callback to the * calipso_doi_walk() function for use in generating a response for a LISTALL * message. Returns the size of the message on success, negative values on * failure. * */ static int netlbl_calipso_listall_cb(struct calipso_doi *doi_def, void *arg) { int ret_val = -ENOMEM; struct netlbl_calipso_doiwalk_arg *cb_arg = arg; void *data; data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, cb_arg->seq, &netlbl_calipso_gnl_family, NLM_F_MULTI, NLBL_CALIPSO_C_LISTALL); if (!data) goto listall_cb_failure; ret_val = nla_put_u32(cb_arg->skb, NLBL_CALIPSO_A_DOI, doi_def->doi); if (ret_val != 0) goto listall_cb_failure; ret_val = nla_put_u32(cb_arg->skb, NLBL_CALIPSO_A_MTYPE, doi_def->type); if (ret_val != 0) goto listall_cb_failure; genlmsg_end(cb_arg->skb, data); return 0; listall_cb_failure: genlmsg_cancel(cb_arg->skb, data); return ret_val; } /** * netlbl_calipso_listall - Handle a LISTALL message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated LISTALL message and respond accordingly. Returns * zero on success and negative values on error. * */ static int netlbl_calipso_listall(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_calipso_doiwalk_arg cb_arg; u32 doi_skip = cb->args[0]; cb_arg.nl_cb = cb; cb_arg.skb = skb; cb_arg.seq = cb->nlh->nlmsg_seq; calipso_doi_walk(&doi_skip, netlbl_calipso_listall_cb, &cb_arg); cb->args[0] = doi_skip; return skb->len; } /** * netlbl_calipso_remove_cb - netlbl_calipso_remove() callback for REMOVE * @entry: LSM domain mapping entry * @arg: the netlbl_domhsh_walk_arg structure * * Description: * This function is intended for use by netlbl_calipso_remove() as the callback * for the netlbl_domhsh_walk() function; it removes LSM domain map entries * which are associated with the CALIPSO DOI specified in @arg. Returns zero on * success, negative values on failure. * */ static int netlbl_calipso_remove_cb(struct netlbl_dom_map *entry, void *arg) { struct netlbl_domhsh_walk_arg *cb_arg = arg; if (entry->def.type == NETLBL_NLTYPE_CALIPSO && entry->def.calipso->doi == cb_arg->doi) return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info); return 0; } /** * netlbl_calipso_remove - Handle a REMOVE message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated REMOVE message and respond accordingly. Returns * zero on success, negative values on failure. * */ static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info) { int ret_val = -EINVAL; struct netlbl_domhsh_walk_arg cb_arg; struct netlbl_audit audit_info; u32 skip_bkt = 0; u32 skip_chain = 0; if (!info->attrs[NLBL_CALIPSO_A_DOI]) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); cb_arg.doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); cb_arg.audit_info = &audit_info; ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, netlbl_calipso_remove_cb, &cb_arg); if (ret_val == 0 || ret_val == -ENOENT) { ret_val = calipso_doi_remove(cb_arg.doi, &audit_info); if (ret_val == 0) atomic_dec(&netlabel_mgmt_protocount); } return ret_val; } /* NetLabel Generic NETLINK Command Definitions */ static const struct genl_small_ops netlbl_calipso_ops[] = { { .cmd = NLBL_CALIPSO_C_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_calipso_add, .dumpit = NULL, }, { .cmd = NLBL_CALIPSO_C_REMOVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_calipso_remove, .dumpit = NULL, }, { .cmd = NLBL_CALIPSO_C_LIST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = netlbl_calipso_list, .dumpit = NULL, }, { .cmd = NLBL_CALIPSO_C_LISTALL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_calipso_listall, }, }; static struct genl_family netlbl_calipso_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_CALIPSO_NAME, .version = NETLBL_PROTO_VERSION, .maxattr = NLBL_CALIPSO_A_MAX, .policy = calipso_genl_policy, .module = THIS_MODULE, .small_ops = netlbl_calipso_ops, .n_small_ops = ARRAY_SIZE(netlbl_calipso_ops), .resv_start_op = NLBL_CALIPSO_C_LISTALL + 1, }; /* NetLabel Generic NETLINK Protocol Functions */ /** * netlbl_calipso_genl_init - Register the CALIPSO NetLabel component * * Description: * Register the CALIPSO packet NetLabel component with the Generic NETLINK * mechanism. Returns zero on success, negative values on failure. * */ int __init netlbl_calipso_genl_init(void) { return genl_register_family(&netlbl_calipso_gnl_family); } /** * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine * @doi_def: the DOI structure * @audit_info: NetLabel audit information * * Description: * The caller defines a new DOI for use by the CALIPSO engine and calls this * function to add it to the list of acceptable domains. The caller must * ensure that the mapping table specified in @doi_def->map meets all of the * requirements of the mapping type (see calipso.h for details). Returns * zero on success and non-zero on failure. * */ int calipso_doi_add(struct calipso_doi *doi_def, struct netlbl_audit *audit_info) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->doi_add(doi_def, audit_info); return ret_val; } /** * calipso_doi_free - Frees a DOI definition * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. * */ void calipso_doi_free(struct calipso_doi *doi_def) { const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ops->doi_free(doi_def); } /** * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine * @doi: the DOI value * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will * be called to release their own LSM domain mappings as well as our own * domain list. Returns zero on success and negative values on failure. * */ int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->doi_remove(doi, audit_info); return ret_val; } /** * calipso_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that * calipso_doi_putdef() is called when the caller is done. * */ struct calipso_doi *calipso_doi_getdef(u32 doi) { struct calipso_doi *ret_val = NULL; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->doi_getdef(doi); return ret_val; } /** * calipso_doi_putdef - Releases a reference for the given DOI definition * @doi_def: the DOI definition * * Description: * Releases a DOI definition reference obtained from calipso_doi_getdef(). * */ void calipso_doi_putdef(struct calipso_doi *doi_def) { const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ops->doi_putdef(doi_def); } /** * calipso_doi_walk - Iterate through the DOI definitions * @skip_cnt: skip past this number of DOI definitions, updated * @callback: callback for each DOI definition * @cb_arg: argument for the callback function * * Description: * Iterate over the DOI definition list, skipping the first @skip_cnt entries. * For each entry call @callback, if @callback returns a negative value stop * 'walking' through the list and return. Updates the value in @skip_cnt upon * return. Returns zero on success, negative values on failure. * */ int calipso_doi_walk(u32 *skip_cnt, int (*callback)(struct calipso_doi *doi_def, void *arg), void *cb_arg) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->doi_walk(skip_cnt, callback, cb_arg); return ret_val; } /** * calipso_sock_getattr - Get the security attributes from a sock * @sk: the sock * @secattr: the security attributes * * Description: * Query @sk to see if there is a CALIPSO option attached to the sock and if * there is return the CALIPSO security attributes in @secattr. This function * requires that @sk be locked, or privately held, but it does not do any * locking itself. Returns zero on success and negative values on failure. * */ int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->sock_getattr(sk, secattr); return ret_val; } /** * calipso_sock_setattr - Add a CALIPSO option to a socket * @sk: the socket * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CALIPSO option on the given socket using the DOI definition and * security attributes passed to the function. This function requires * exclusive access to @sk, which means it either needs to be in the * process of being created or locked. Returns zero on success and negative * values on failure. * */ int calipso_sock_setattr(struct sock *sk, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->sock_setattr(sk, doi_def, secattr); return ret_val; } /** * calipso_sock_delattr - Delete the CALIPSO option from a socket * @sk: the socket * * Description: * Removes the CALIPSO option from a socket, if present. * */ void calipso_sock_delattr(struct sock *sk) { const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ops->sock_delattr(sk); } /** * calipso_req_setattr - Add a CALIPSO option to a connection request socket * @req: the connection request socket * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CALIPSO option on the given socket using the DOI definition and * security attributes passed to the function. Returns zero on success and * negative values on failure. * */ int calipso_req_setattr(struct request_sock *req, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->req_setattr(req, doi_def, secattr); return ret_val; } /** * calipso_req_delattr - Delete the CALIPSO option from a request socket * @req: the request socket * * Description: * Removes the CALIPSO option from a request socket, if present. * */ void calipso_req_delattr(struct request_sock *req) { const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ops->req_delattr(req); } /** * calipso_optptr - Find the CALIPSO option in the packet * @skb: the packet * * Description: * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer * to the start of the CALIPSO option on success, NULL if one if not found. * */ unsigned char *calipso_optptr(const struct sk_buff *skb) { unsigned char *ret_val = NULL; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->skbuff_optptr(skb); return ret_val; } /** * calipso_getattr - Get the security attributes from a memory block. * @calipso: the CALIPSO option * @secattr: the security attributes * * Description: * Inspect @calipso and return the security attributes in @secattr. * Returns zero on success and negative values on failure. * */ int calipso_getattr(const unsigned char *calipso, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->opt_getattr(calipso, secattr); return ret_val; } /** * calipso_skbuff_setattr - Set the CALIPSO option on a packet * @skb: the packet * @doi_def: the CALIPSO DOI to use * @secattr: the security attributes * * Description: * Set the CALIPSO option on the given packet based on the security attributes. * Returns a pointer to the IP header on success and NULL on failure. * */ int calipso_skbuff_setattr(struct sk_buff *skb, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->skbuff_setattr(skb, doi_def, secattr); return ret_val; } /** * calipso_skbuff_delattr - Delete any CALIPSO options from a packet * @skb: the packet * * Description: * Removes any and all CALIPSO options from the given packet. Returns zero on * success, negative values on failure. * */ int calipso_skbuff_delattr(struct sk_buff *skb) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->skbuff_delattr(skb); return ret_val; } /** * calipso_cache_invalidate - Invalidates the current CALIPSO cache * * Description: * Invalidates and frees any entries in the CALIPSO cache. Returns zero on * success and negative values on failure. * */ void calipso_cache_invalidate(void) { const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ops->cache_invalidate(); } /** * calipso_cache_add - Add an entry to the CALIPSO cache * @calipso_ptr: the CALIPSO option * @secattr: the packet's security attributes * * Description: * Add a new entry into the CALIPSO label mapping cache. * Returns zero on success, negative values on failure. * */ int calipso_cache_add(const unsigned char *calipso_ptr, const struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->cache_add(calipso_ptr, secattr); return ret_val; }
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 // SPDX-License-Identifier: GPL-2.0 /* * IEEE 802.15.4 PAN management * * Copyright (C) 2023 Qorvo US, Inc * Authors: * - David Girault <david.girault@qorvo.com> * - Miquel Raynal <miquel.raynal@bootlin.com> */ #include <linux/kernel.h> #include <net/cfg802154.h> #include <net/af_ieee802154.h> /* Checks whether a device address matches one from the PAN list. * This helper is meant to be used only during PAN management, when we expect * extended addresses to be used. */ static bool cfg802154_pan_device_is_matching(struct ieee802154_pan_device *pan_dev, struct ieee802154_addr *ext_dev) { if (!pan_dev || !ext_dev) return false; if (ext_dev->mode == IEEE802154_ADDR_SHORT) return false; return pan_dev->extended_addr == ext_dev->extended_addr; } bool cfg802154_device_is_associated(struct wpan_dev *wpan_dev) { bool is_assoc; mutex_lock(&wpan_dev->association_lock); is_assoc = !list_empty(&wpan_dev->children) || wpan_dev->parent; mutex_unlock(&wpan_dev->association_lock); return is_assoc; } bool cfg802154_device_is_parent(struct wpan_dev *wpan_dev, struct ieee802154_addr *target) { lockdep_assert_held(&wpan_dev->association_lock); return cfg802154_pan_device_is_matching(wpan_dev->parent, target); } EXPORT_SYMBOL_GPL(cfg802154_device_is_parent); struct ieee802154_pan_device * cfg802154_device_is_child(struct wpan_dev *wpan_dev, struct ieee802154_addr *target) { struct ieee802154_pan_device *child; lockdep_assert_held(&wpan_dev->association_lock); list_for_each_entry(child, &wpan_dev->children, node) if (cfg802154_pan_device_is_matching(child, target)) return child; return NULL; } EXPORT_SYMBOL_GPL(cfg802154_device_is_child); __le16 cfg802154_get_free_short_addr(struct wpan_dev *wpan_dev) { struct ieee802154_pan_device *child; __le16 addr; lockdep_assert_held(&wpan_dev->association_lock); do { get_random_bytes(&addr, 2); if (addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST) || addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC)) continue; if (wpan_dev->short_addr == addr) continue; if (wpan_dev->parent && wpan_dev->parent->short_addr == addr) continue; list_for_each_entry(child, &wpan_dev->children, node) if (child->short_addr == addr) continue; break; } while (1); return addr; } EXPORT_SYMBOL_GPL(cfg802154_get_free_short_addr); unsigned int cfg802154_set_max_associations(struct wpan_dev *wpan_dev, unsigned int max) { unsigned int old_max; lockdep_assert_held(&wpan_dev->association_lock); old_max = wpan_dev->max_associations; wpan_dev->max_associations = max; return old_max; } EXPORT_SYMBOL_GPL(cfg802154_set_max_associations);
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* SPDX-License-Identifier: GPL-2.0-only */ /* * File: pep.h * * Phonet Pipe End Point sockets definitions * * Copyright (C) 2008 Nokia Corporation. */ #ifndef NET_PHONET_PEP_H #define NET_PHONET_PEP_H #include <linux/skbuff.h> #include <net/phonet/phonet.h> struct pep_sock { struct pn_sock pn_sk; /* XXX: union-ify listening vs connected stuff ? */ /* Listening socket stuff: */ struct hlist_head hlist; /* Connected socket stuff: */ struct sock *listener; struct sk_buff_head ctrlreq_queue; #define PNPIPE_CTRLREQ_MAX 10 atomic_t tx_credits; int ifindex; u16 peer_type; /* peer type/subtype */ u8 pipe_handle; u8 rx_credits; u8 rx_fc; /* RX flow control */ u8 tx_fc; /* TX flow control */ u8 init_enable; /* auto-enable at creation */ u8 aligned; }; static inline struct pep_sock *pep_sk(struct sock *sk) { return (struct pep_sock *)sk; } extern const struct proto_ops phonet_stream_ops; /* Pipe protocol definitions */ struct pnpipehdr { u8 utid; /* transaction ID */ u8 message_id; u8 pipe_handle; union { u8 state_after_connect; /* connect request */ u8 state_after_reset; /* reset request */ u8 error_code; /* any response */ u8 pep_type; /* status indication */ u8 data0; /* anything else */ }; u8 data[]; }; #define other_pep_type data[0] static inline struct pnpipehdr *pnp_hdr(struct sk_buff *skb) { return (struct pnpipehdr *)skb_transport_header(skb); } #define MAX_PNPIPE_HEADER (MAX_PHONET_HEADER + 4) enum { PNS_PIPE_CREATE_REQ = 0x00, PNS_PIPE_CREATE_RESP, PNS_PIPE_REMOVE_REQ, PNS_PIPE_REMOVE_RESP, PNS_PIPE_DATA = 0x20, PNS_PIPE_ALIGNED_DATA, PNS_PEP_CONNECT_REQ = 0x40, PNS_PEP_CONNECT_RESP, PNS_PEP_DISCONNECT_REQ, PNS_PEP_DISCONNECT_RESP, PNS_PEP_RESET_REQ, PNS_PEP_RESET_RESP, PNS_PEP_ENABLE_REQ, PNS_PEP_ENABLE_RESP, PNS_PEP_CTRL_REQ, PNS_PEP_CTRL_RESP, PNS_PEP_DISABLE_REQ = 0x4C, PNS_PEP_DISABLE_RESP, PNS_PEP_STATUS_IND = 0x60, PNS_PIPE_CREATED_IND, PNS_PIPE_RESET_IND = 0x63, PNS_PIPE_ENABLED_IND, PNS_PIPE_REDIRECTED_IND, PNS_PIPE_DISABLED_IND = 0x66, }; #define PN_PIPE_INVALID_HANDLE 0xff #define PN_PEP_TYPE_COMMON 0x00 /* Phonet pipe status indication */ enum { PN_PEP_IND_FLOW_CONTROL, PN_PEP_IND_ID_MCFC_GRANT_CREDITS, }; /* Phonet pipe error codes */ enum { PN_PIPE_NO_ERROR, PN_PIPE_ERR_INVALID_PARAM, PN_PIPE_ERR_INVALID_HANDLE, PN_PIPE_ERR_INVALID_CTRL_ID, PN_PIPE_ERR_NOT_ALLOWED, PN_PIPE_ERR_PEP_IN_USE, PN_PIPE_ERR_OVERLOAD, PN_PIPE_ERR_DEV_DISCONNECTED, PN_PIPE_ERR_TIMEOUT, PN_PIPE_ERR_ALL_PIPES_IN_USE, PN_PIPE_ERR_GENERAL, PN_PIPE_ERR_NOT_SUPPORTED, }; /* Phonet pipe states */ enum { PN_PIPE_DISABLE, PN_PIPE_ENABLE, }; /* Phonet pipe sub-block types */ enum { PN_PIPE_SB_CREATE_REQ_PEP_SUB_TYPE, PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE, PN_PIPE_SB_REDIRECT_REQ_PEP_SUB_TYPE, PN_PIPE_SB_NEGOTIATED_FC, PN_PIPE_SB_REQUIRED_FC_TX, PN_PIPE_SB_PREFERRED_FC_RX, PN_PIPE_SB_ALIGNED_DATA, }; /* Phonet pipe flow control models */ enum { PN_NO_FLOW_CONTROL, PN_LEGACY_FLOW_CONTROL, PN_ONE_CREDIT_FLOW_CONTROL, PN_MULTI_CREDIT_FLOW_CONTROL, PN_MAX_FLOW_CONTROL, }; #define pn_flow_safe(fc) ((fc) >> 1) /* Phonet pipe flow control states */ enum { PEP_IND_EMPTY, PEP_IND_BUSY, PEP_IND_READY, }; #endif
20 13 11 19 4 18 20 12 12 12 4 11 10 11 18 8 19 8 19 19 12 8 20 20 9 12 8 20 8 8 8 8 8 8 8 8 8 8 17 20 20 8 19 12 4 8 4 8 12 12 4 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 // SPDX-License-Identifier: GPL-2.0-or-later /* Keyring handling * * Copyright (C) 2004-2005, 2008, 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/seq_file.h> #include <linux/err.h> #include <linux/user_namespace.h> #include <linux/nsproxy.h> #include <keys/keyring-type.h> #include <keys/user-type.h> #include <linux/assoc_array_priv.h> #include <linux/uaccess.h> #include <net/net_namespace.h> #include "internal.h" /* * When plumbing the depths of the key tree, this sets a hard limit * set on how deep we're willing to go. */ #define KEYRING_SEARCH_MAX_DEPTH 6 /* * We mark pointers we pass to the associative array with bit 1 set if * they're keyrings and clear otherwise. */ #define KEYRING_PTR_SUBTYPE 0x2UL static inline bool keyring_ptr_is_keyring(const struct assoc_array_ptr *x) { return (unsigned long)x & KEYRING_PTR_SUBTYPE; } static inline struct key *keyring_ptr_to_key(const struct assoc_array_ptr *x) { void *object = assoc_array_ptr_to_leaf(x); return (struct key *)((unsigned long)object & ~KEYRING_PTR_SUBTYPE); } static inline void *keyring_key_to_ptr(struct key *key) { if (key->type == &key_type_keyring) return (void *)((unsigned long)key | KEYRING_PTR_SUBTYPE); return key; } static DEFINE_RWLOCK(keyring_name_lock); /* * Clean up the bits of user_namespace that belong to us. */ void key_free_user_ns(struct user_namespace *ns) { write_lock(&keyring_name_lock); list_del_init(&ns->keyring_name_list); write_unlock(&keyring_name_lock); key_put(ns->user_keyring_register); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif } /* * The keyring key type definition. Keyrings are simply keys of this type and * can be treated as ordinary keys in addition to having their own special * operations. */ static int keyring_preparse(struct key_preparsed_payload *prep); static void keyring_free_preparse(struct key_preparsed_payload *prep); static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep); static void keyring_revoke(struct key *keyring); static void keyring_destroy(struct key *keyring); static void keyring_describe(const struct key *keyring, struct seq_file *m); static long keyring_read(const struct key *keyring, char *buffer, size_t buflen); struct key_type key_type_keyring = { .name = "keyring", .def_datalen = 0, .preparse = keyring_preparse, .free_preparse = keyring_free_preparse, .instantiate = keyring_instantiate, .revoke = keyring_revoke, .destroy = keyring_destroy, .describe = keyring_describe, .read = keyring_read, }; EXPORT_SYMBOL(key_type_keyring); /* * Semaphore to serialise link/link calls to prevent two link calls in parallel * introducing a cycle. */ static DEFINE_MUTEX(keyring_serialise_link_lock); /* * Publish the name of a keyring so that it can be found by name (if it has * one and it doesn't begin with a dot). */ static void keyring_publish_name(struct key *keyring) { struct user_namespace *ns = current_user_ns(); if (keyring->description && keyring->description[0] && keyring->description[0] != '.') { write_lock(&keyring_name_lock); list_add_tail(&keyring->name_link, &ns->keyring_name_list); write_unlock(&keyring_name_lock); } } /* * Preparse a keyring payload */ static int keyring_preparse(struct key_preparsed_payload *prep) { return prep->datalen != 0 ? -EINVAL : 0; } /* * Free a preparse of a user defined key payload */ static void keyring_free_preparse(struct key_preparsed_payload *prep) { } /* * Initialise a keyring. * * Returns 0 on success, -EINVAL if given any data. */ static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep) { assoc_array_init(&keyring->keys); /* make the keyring available by name if it has one */ keyring_publish_name(keyring); return 0; } /* * Multiply 64-bits by 32-bits to 96-bits and fold back to 64-bit. Ideally we'd * fold the carry back too, but that requires inline asm. */ static u64 mult_64x32_and_fold(u64 x, u32 y) { u64 hi = (u64)(u32)(x >> 32) * y; u64 lo = (u64)(u32)(x) * y; return lo + ((u64)(u32)hi << 32) + (u32)(hi >> 32); } /* * Hash a key type and description. */ static void hash_key_type_and_desc(struct keyring_index_key *index_key) { const unsigned level_shift = ASSOC_ARRAY_LEVEL_STEP; const unsigned long fan_mask = ASSOC_ARRAY_FAN_MASK; const char *description = index_key->description; unsigned long hash, type; u32 piece; u64 acc; int n, desc_len = index_key->desc_len; type = (unsigned long)index_key->type; acc = mult_64x32_and_fold(type, desc_len + 13); acc = mult_64x32_and_fold(acc, 9207); piece = (unsigned long)index_key->domain_tag; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); for (;;) { n = desc_len; if (n <= 0) break; if (n > 4) n = 4; piece = 0; memcpy(&piece, description, n); description += n; desc_len -= n; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); } /* Fold the hash down to 32 bits if need be. */ hash = acc; if (ASSOC_ARRAY_KEY_CHUNK_SIZE == 32) hash ^= acc >> 32; /* Squidge all the keyrings into a separate part of the tree to * ordinary keys by making sure the lowest level segment in the hash is * zero for keyrings and non-zero otherwise. */ if (index_key->type != &key_type_keyring && (hash & fan_mask) == 0) hash |= (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1; else if (index_key->type == &key_type_keyring && (hash & fan_mask) != 0) hash = (hash + (hash << level_shift)) & ~fan_mask; index_key->hash = hash; } /* * Finalise an index key to include a part of the description actually in the * index key, to set the domain tag and to calculate the hash. */ void key_set_index_key(struct keyring_index_key *index_key) { static struct key_tag default_domain_tag = { .usage = REFCOUNT_INIT(1), }; size_t n = min_t(size_t, index_key->desc_len, sizeof(index_key->desc)); memcpy(index_key->desc, index_key->description, n); if (!index_key->domain_tag) { if (index_key->type->flags & KEY_TYPE_NET_DOMAIN) index_key->domain_tag = current->nsproxy->net_ns->key_domain; else index_key->domain_tag = &default_domain_tag; } hash_key_type_and_desc(index_key); } /** * key_put_tag - Release a ref on a tag. * @tag: The tag to release. * * This releases a reference the given tag and returns true if that ref was the * last one. */ bool key_put_tag(struct key_tag *tag) { if (refcount_dec_and_test(&tag->usage)) { kfree_rcu(tag, rcu); return true; } return false; } /** * key_remove_domain - Kill off a key domain and gc its keys * @domain_tag: The domain tag to release. * * This marks a domain tag as being dead and releases a ref on it. If that * wasn't the last reference, the garbage collector is poked to try and delete * all keys that were in the domain. */ void key_remove_domain(struct key_tag *domain_tag) { domain_tag->removed = true; if (!key_put_tag(domain_tag)) key_schedule_gc_links(); } /* * Build the next index key chunk. * * We return it one word-sized chunk at a time. */ static unsigned long keyring_get_key_chunk(const void *data, int level) { const struct keyring_index_key *index_key = data; unsigned long chunk = 0; const u8 *d; int desc_len = index_key->desc_len, n = sizeof(chunk); level /= ASSOC_ARRAY_KEY_CHUNK_SIZE; switch (level) { case 0: return index_key->hash; case 1: return index_key->x; case 2: return (unsigned long)index_key->type; case 3: return (unsigned long)index_key->domain_tag; default: level -= 4; if (desc_len <= sizeof(index_key->desc)) return 0; d = index_key->description + sizeof(index_key->desc); d += level * sizeof(long); desc_len -= sizeof(index_key->desc); if (desc_len > n) desc_len = n; do { chunk <<= 8; chunk |= *d++; } while (--desc_len > 0); return chunk; } } static unsigned long keyring_get_object_key_chunk(const void *object, int level) { const struct key *key = keyring_ptr_to_key(object); return keyring_get_key_chunk(&key->index_key, level); } static bool keyring_compare_object(const void *object, const void *data) { const struct keyring_index_key *index_key = data; const struct key *key = keyring_ptr_to_key(object); return key->index_key.type == index_key->type && key->index_key.domain_tag == index_key->domain_tag && key->index_key.desc_len == index_key->desc_len && memcmp(key->index_key.description, index_key->description, index_key->desc_len) == 0; } /* * Compare the index keys of a pair of objects and determine the bit position * at which they differ - if they differ. */ static int keyring_diff_objects(const void *object, const void *data) { const struct key *key_a = keyring_ptr_to_key(object); const struct keyring_index_key *a = &key_a->index_key; const struct keyring_index_key *b = data; unsigned long seg_a, seg_b; int level, i; level = 0; seg_a = a->hash; seg_b = b->hash; if ((seg_a ^ seg_b) != 0) goto differ; level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8; /* The number of bits contributed by the hash is controlled by a * constant in the assoc_array headers. Everything else thereafter we * can deal with as being machine word-size dependent. */ seg_a = a->x; seg_b = b->x; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); /* The next bit may not work on big endian */ seg_a = (unsigned long)a->type; seg_b = (unsigned long)b->type; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); seg_a = (unsigned long)a->domain_tag; seg_b = (unsigned long)b->domain_tag; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); i = sizeof(a->desc); if (a->desc_len <= i) goto same; for (; i < a->desc_len; i++) { seg_a = *(unsigned char *)(a->description + i); seg_b = *(unsigned char *)(b->description + i); if ((seg_a ^ seg_b) != 0) goto differ_plus_i; } same: return -1; differ_plus_i: level += i; differ: i = level * 8 + __ffs(seg_a ^ seg_b); return i; } /* * Free an object after stripping the keyring flag off of the pointer. */ static void keyring_free_object(void *object) { key_put(keyring_ptr_to_key(object)); } /* * Operations for keyring management by the index-tree routines. */ static const struct assoc_array_ops keyring_assoc_array_ops = { .get_key_chunk = keyring_get_key_chunk, .get_object_key_chunk = keyring_get_object_key_chunk, .compare_object = keyring_compare_object, .diff_objects = keyring_diff_objects, .free_object = keyring_free_object, }; /* * Clean up a keyring when it is destroyed. Unpublish its name if it had one * and dispose of its data. * * The garbage collector detects the final key_put(), removes the keyring from * the serial number tree and then does RCU synchronisation before coming here, * so we shouldn't need to worry about code poking around here with the RCU * readlock held by this time. */ static void keyring_destroy(struct key *keyring) { if (keyring->description) { write_lock(&keyring_name_lock); if (keyring->name_link.next != NULL && !list_empty(&keyring->name_link)) list_del(&keyring->name_link); write_unlock(&keyring_name_lock); } if (keyring->restrict_link) { struct key_restriction *keyres = keyring->restrict_link; key_put(keyres->key); kfree(keyres); } assoc_array_destroy(&keyring->keys, &keyring_assoc_array_ops); } /* * Describe a keyring for /proc. */ static void keyring_describe(const struct key *keyring, struct seq_file *m) { if (keyring->description) seq_puts(m, keyring->description); else seq_puts(m, "[anon]"); if (key_is_positive(keyring)) { if (keyring->keys.nr_leaves_on_tree != 0) seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree); else seq_puts(m, ": empty"); } } struct keyring_read_iterator_context { size_t buflen; size_t count; key_serial_t *buffer; }; static int keyring_read_iterator(const void *object, void *data) { struct keyring_read_iterator_context *ctx = data; const struct key *key = keyring_ptr_to_key(object); kenter("{%s,%d},,{%zu/%zu}", key->type->name, key->serial, ctx->count, ctx->buflen); if (ctx->count >= ctx->buflen) return 1; *ctx->buffer++ = key->serial; ctx->count += sizeof(key->serial); return 0; } /* * Read a list of key IDs from the keyring's contents in binary form * * The keyring's semaphore is read-locked by the caller. This prevents someone * from modifying it under us - which could cause us to read key IDs multiple * times. */ static long keyring_read(const struct key *keyring, char *buffer, size_t buflen) { struct keyring_read_iterator_context ctx; long ret; kenter("{%d},,%zu", key_serial(keyring), buflen); if (buflen & (sizeof(key_serial_t) - 1)) return -EINVAL; /* Copy as many key IDs as fit into the buffer */ if (buffer && buflen) { ctx.buffer = (key_serial_t *)buffer; ctx.buflen = buflen; ctx.count = 0; ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); if (ret < 0) { kleave(" = %ld [iterate]", ret); return ret; } } /* Return the size of the buffer needed */ ret = keyring->keys.nr_leaves_on_tree * sizeof(key_serial_t); if (ret <= buflen) kleave("= %ld [ok]", ret); else kleave("= %ld [buffer too small]", ret); return ret; } /* * Allocate a keyring and link into the destination keyring. */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, uid, gid, cred, perm, flags, restrict_link); if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { key_put(keyring); keyring = ERR_PTR(ret); } } return keyring; } EXPORT_SYMBOL(keyring_alloc); /** * restrict_link_reject - Give -EPERM to restrict link * @keyring: The keyring being added to. * @type: The type of key being added. * @payload: The payload of the key intended to be added. * @restriction_key: Keys providing additional data for evaluating restriction. * * Reject the addition of any links to a keyring. It can be overridden by * passing KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when * adding a key to a keyring. * * This is meant to be stored in a key_restriction structure which is passed * in the restrict_link parameter to keyring_alloc(). */ int restrict_link_reject(struct key *keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return -EPERM; } /* * By default, we keys found by getting an exact match on their descriptions. */ bool key_default_cmp(const struct key *key, const struct key_match_data *match_data) { return strcmp(key->description, match_data->raw_data) == 0; } /* * Iteration function to consider each key found. */ static int keyring_search_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); unsigned long kflags = READ_ONCE(key->flags); short state = READ_ONCE(key->state); kenter("{%d}", key->serial); /* ignore keys not of this type */ if (key->type != ctx->index_key.type) { kleave(" = 0 [!type]"); return 0; } /* skip invalidated, revoked and expired keys */ if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { time64_t expiry = READ_ONCE(key->expiry); if (kflags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { ctx->result = ERR_PTR(-EKEYREVOKED); kleave(" = %d [invrev]", ctx->skipped_ret); goto skipped; } if (expiry && ctx->now >= expiry) { if (!(ctx->flags & KEYRING_SEARCH_SKIP_EXPIRED)) ctx->result = ERR_PTR(-EKEYEXPIRED); kleave(" = %d [expire]", ctx->skipped_ret); goto skipped; } } /* keys that don't match */ if (!ctx->match_data.cmp(key, &ctx->match_data)) { kleave(" = 0 [!match]"); return 0; } /* key must have search permissions */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) { ctx->result = ERR_PTR(-EACCES); kleave(" = %d [!perm]", ctx->skipped_ret); goto skipped; } if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { /* we set a different error code if we pass a negative key */ if (state < 0) { ctx->result = ERR_PTR(state); kleave(" = %d [neg]", ctx->skipped_ret); goto skipped; } } /* Found */ ctx->result = make_key_ref(key, ctx->possessed); kleave(" = 1 [found]"); return 1; skipped: return ctx->skipped_ret; } /* * Search inside a keyring for a key. We can search by walking to it * directly based on its index-key or we can iterate over the entire * tree looking for it, based on the match function. */ static int search_keyring(struct key *keyring, struct keyring_search_context *ctx) { if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_DIRECT) { const void *object; object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, &ctx->index_key); return object ? ctx->iterator(object, ctx) : 0; } return assoc_array_iterate(&keyring->keys, ctx->iterator, ctx); } /* * Search a tree of keyrings that point to other keyrings up to the maximum * depth. */ static bool search_nested_keyrings(struct key *keyring, struct keyring_search_context *ctx) { struct { struct key *keyring; struct assoc_array_node *node; int slot; } stack[KEYRING_SEARCH_MAX_DEPTH]; struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *ptr; struct key *key; int sp = 0, slot; kenter("{%d},{%s,%s}", keyring->serial, ctx->index_key.type->name, ctx->index_key.description); #define STATE_CHECKS (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_DO_STATE_CHECK) BUG_ON((ctx->flags & STATE_CHECKS) == 0 || (ctx->flags & STATE_CHECKS) == STATE_CHECKS); if (ctx->index_key.description) key_set_index_key(&ctx->index_key); /* Check to see if this top-level keyring is what we are looking for * and whether it is valid or not. */ if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_ITERATE || keyring_compare_object(keyring, &ctx->index_key)) { ctx->skipped_ret = 2; switch (ctx->iterator(keyring_key_to_ptr(keyring), ctx)) { case 1: goto found; case 2: return false; default: break; } } ctx->skipped_ret = 0; /* Start processing a new keyring */ descend_to_keyring: kdebug("descend to %d", keyring->serial); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto not_this_keyring; /* Search through the keys in this keyring before its searching its * subtrees. */ if (search_keyring(keyring, ctx)) goto found; /* Then manually iterate through the keyrings nested in this one. * * Start from the root node of the index tree. Because of the way the * hash function has been set up, keyrings cluster on the leftmost * branch of the root node (root slot 0) or in the root node itself. * Non-keyrings avoid the leftmost branch of the root entirely (root * slots 1-15). */ if (!(ctx->flags & KEYRING_SEARCH_RECURSE)) goto not_this_keyring; ptr = READ_ONCE(keyring->keys.root); if (!ptr) goto not_this_keyring; if (assoc_array_ptr_is_shortcut(ptr)) { /* If the root is a shortcut, either the keyring only contains * keyring pointers (everything clusters behind root slot 0) or * doesn't contain any keyring pointers. */ shortcut = assoc_array_ptr_to_shortcut(ptr); if ((shortcut->index_key[0] & ASSOC_ARRAY_FAN_MASK) != 0) goto not_this_keyring; ptr = READ_ONCE(shortcut->next_node); node = assoc_array_ptr_to_node(ptr); goto begin_node; } node = assoc_array_ptr_to_node(ptr); ptr = node->slots[0]; if (!assoc_array_ptr_is_meta(ptr)) goto begin_node; descend_to_node: /* Descend to a more distal node in this keyring's content tree and go * through that. */ kdebug("descend"); if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->next_node); BUG_ON(!assoc_array_ptr_is_node(ptr)); } node = assoc_array_ptr_to_node(ptr); begin_node: kdebug("begin_node"); slot = 0; ascend_to_node: /* Go through the slots in a node */ for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); if (assoc_array_ptr_is_meta(ptr)) { if (node->back_pointer || assoc_array_ptr_is_shortcut(ptr)) goto descend_to_node; } if (!keyring_ptr_is_keyring(ptr)) continue; key = keyring_ptr_to_key(ptr); if (sp >= KEYRING_SEARCH_MAX_DEPTH) { if (ctx->flags & KEYRING_SEARCH_DETECT_TOO_DEEP) { ctx->result = ERR_PTR(-ELOOP); return false; } goto not_this_keyring; } /* Search a nested keyring */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) continue; /* stack the current position */ stack[sp].keyring = keyring; stack[sp].node = node; stack[sp].slot = slot; sp++; /* begin again with the new keyring */ keyring = key; goto descend_to_keyring; } /* We've dealt with all the slots in the current node, so now we need * to ascend to the parent and continue processing there. */ ptr = READ_ONCE(node->back_pointer); slot = node->parent_slot; if (ptr && assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->back_pointer); slot = shortcut->parent_slot; } if (!ptr) goto not_this_keyring; node = assoc_array_ptr_to_node(ptr); slot++; /* If we've ascended to the root (zero backpointer), we must have just * finished processing the leftmost branch rather than the root slots - * so there can't be any more keyrings for us to find. */ if (node->back_pointer) { kdebug("ascend %d", slot); goto ascend_to_node; } /* The keyring we're looking at was disqualified or didn't contain a * matching key. */ not_this_keyring: kdebug("not_this_keyring %d", sp); if (sp <= 0) { kleave(" = false"); return false; } /* Resume the processing of a keyring higher up in the tree */ sp--; keyring = stack[sp].keyring; node = stack[sp].node; slot = stack[sp].slot + 1; kdebug("ascend to %d [%d]", keyring->serial, slot); goto ascend_to_node; /* We found a viable match */ found: key = key_ref_to_ptr(ctx->result); key_check(key); if (!(ctx->flags & KEYRING_SEARCH_NO_UPDATE_TIME)) { key->last_used_at = ctx->now; keyring->last_used_at = ctx->now; while (sp > 0) stack[--sp].keyring->last_used_at = ctx->now; } kleave(" = true"); return true; } /** * keyring_search_rcu - Search a keyring tree for a matching key under RCU * @keyring_ref: A pointer to the keyring with possession indicator. * @ctx: The keyring search context. * * Search the supplied keyring tree for a key that matches the criteria given. * The root keyring and any linked keyrings must grant Search permission to the * caller to be searchable and keys can only be found if they too grant Search * to the caller. The possession flag on the root keyring pointer controls use * of the possessor bits in permissions checking of the entire tree. In * addition, the LSM gets to forbid keyring searches and key matches. * * The search is performed as a breadth-then-depth search up to the prescribed * limit (KEYRING_SEARCH_MAX_DEPTH). The caller must hold the RCU read lock to * prevent keyrings from being destroyed or rearranged whilst they are being * searched. * * Keys are matched to the type provided and are then filtered by the match * function, which is given the description to use in any way it sees fit. The * match function may use any attributes of a key that it wishes to * determine the match. Normally the match function from the key type would be * used. * * RCU can be used to prevent the keyring key lists from disappearing without * the need to take lots of locks. * * Returns a pointer to the found key and increments the key usage count if * successful; -EAGAIN if no matching keys were found, or if expired or revoked * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the * specified keyring wasn't a keyring. * * In the case of a successful return, the possession attribute from * @keyring_ref is propagated to the returned key reference. */ key_ref_t keyring_search_rcu(key_ref_t keyring_ref, struct keyring_search_context *ctx) { struct key *keyring; long err; ctx->iterator = keyring_search_iterator; ctx->possessed = is_key_possessed(keyring_ref); ctx->result = ERR_PTR(-EAGAIN); keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return ERR_PTR(-ENOTDIR); if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM)) { err = key_task_permission(keyring_ref, ctx->cred, KEY_NEED_SEARCH); if (err < 0) return ERR_PTR(err); } ctx->now = ktime_get_real_seconds(); if (search_nested_keyrings(keyring, ctx)) __key_get(key_ref_to_ptr(ctx->result)); return ctx->result; } /** * keyring_search - Search the supplied keyring tree for a matching key * @keyring: The root of the keyring tree to be searched. * @type: The type of keyring we want to find. * @description: The name of the keyring we want to find. * @recurse: True to search the children of @keyring also * * As keyring_search_rcu() above, but using the current task's credentials and * type's default matching function and preferred search method. */ key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, const char *description, bool recurse) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; key_ref_t key; int ret; if (recurse) ctx.flags |= KEYRING_SEARCH_RECURSE; if (type->match_preparse) { ret = type->match_preparse(&ctx.match_data); if (ret < 0) return ERR_PTR(ret); } rcu_read_lock(); key = keyring_search_rcu(keyring, &ctx); rcu_read_unlock(); if (type->match_free) type->match_free(&ctx.match_data); return key; } EXPORT_SYMBOL(keyring_search); static struct key_restriction *keyring_restriction_alloc( key_restrict_link_func_t check) { struct key_restriction *keyres = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); if (!keyres) return ERR_PTR(-ENOMEM); keyres->check = check; return keyres; } /* * Semaphore to serialise restriction setup to prevent reference count * cycles through restriction key pointers. */ static DECLARE_RWSEM(keyring_serialise_restrict_sem); /* * Check for restriction cycles that would prevent keyring garbage collection. * keyring_serialise_restrict_sem must be held. */ static bool keyring_detect_restriction_cycle(const struct key *dest_keyring, struct key_restriction *keyres) { while (keyres && keyres->key && keyres->key->type == &key_type_keyring) { if (keyres->key == dest_keyring) return true; keyres = keyres->key->restrict_link; } return false; } /** * keyring_restrict - Look up and apply a restriction to a keyring * @keyring_ref: The keyring to be restricted * @type: The key type that will provide the restriction checker. * @restriction: The restriction options to apply to the keyring * * Look up a keyring and apply a restriction to it. The restriction is managed * by the specific key type, but can be configured by the options specified in * the restriction string. */ int keyring_restrict(key_ref_t keyring_ref, const char *type, const char *restriction) { struct key *keyring; struct key_type *restrict_type = NULL; struct key_restriction *restrict_link; int ret = 0; keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return -ENOTDIR; if (!type) { restrict_link = keyring_restriction_alloc(restrict_link_reject); } else { restrict_type = key_type_lookup(type); if (IS_ERR(restrict_type)) return PTR_ERR(restrict_type); if (!restrict_type->lookup_restriction) { ret = -ENOENT; goto error; } restrict_link = restrict_type->lookup_restriction(restriction); } if (IS_ERR(restrict_link)) { ret = PTR_ERR(restrict_link); goto error; } down_write(&keyring->sem); down_write(&keyring_serialise_restrict_sem); if (keyring->restrict_link) { ret = -EEXIST; } else if (keyring_detect_restriction_cycle(keyring, restrict_link)) { ret = -EDEADLK; } else { keyring->restrict_link = restrict_link; notify_key(keyring, NOTIFY_KEY_SETATTR, 0); } up_write(&keyring_serialise_restrict_sem); up_write(&keyring->sem); if (ret < 0) { key_put(restrict_link->key); kfree(restrict_link); } error: if (restrict_type) key_type_put(restrict_type); return ret; } EXPORT_SYMBOL(keyring_restrict); /* * Search the given keyring for a key that might be updated. * * The caller must guarantee that the keyring is a keyring and that the * permission is granted to modify the keyring as no check is made here. The * caller must also hold a lock on the keyring semaphore. * * Returns a pointer to the found key with usage count incremented if * successful and returns NULL if not found. Revoked and invalidated keys are * skipped over. * * If successful, the possession indicator is propagated from the keyring ref * to the returned key reference. */ key_ref_t find_key_to_update(key_ref_t keyring_ref, const struct keyring_index_key *index_key) { struct key *keyring, *key; const void *object; keyring = key_ref_to_ptr(keyring_ref); kenter("{%d},{%s,%s}", keyring->serial, index_key->type->name, index_key->description); object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, index_key); if (object) goto found; kleave(" = NULL"); return NULL; found: key = keyring_ptr_to_key(object); if (key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { kleave(" = NULL [x]"); return NULL; } __key_get(key); kleave(" = {%d}", key->serial); return make_key_ref(key, is_key_possessed(keyring_ref)); } /* * Find a keyring with the specified name. * * Only keyrings that have nonzero refcount, are not revoked, and are owned by a * user in the current user namespace are considered. If @uid_keyring is %true, * the keyring additionally must have been allocated as a user or user session * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. */ struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct user_namespace *ns = current_user_ns(); struct key *keyring; if (!name) return ERR_PTR(-EINVAL); read_lock(&keyring_name_lock); /* Search this hash bucket for a keyring with a matching name that * grants Search permission and that hasn't been revoked */ list_for_each_entry(keyring, &ns->keyring_name_list, name_link) { if (!kuid_has_mapping(ns, keyring->user->uid)) continue; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) continue; if (strcmp(keyring->description, name) != 0) continue; if (uid_keyring) { if (!test_bit(KEY_FLAG_UID_KEYRING, &keyring->flags)) continue; } else { if (key_permission(make_key_ref(keyring, 0), KEY_NEED_SEARCH) < 0) continue; } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' * (ie. it has a zero usage count) */ if (!refcount_inc_not_zero(&keyring->usage)) continue; keyring->last_used_at = ktime_get_real_seconds(); goto out; } keyring = ERR_PTR(-ENOKEY); out: read_unlock(&keyring_name_lock); return keyring; } static int keyring_detect_cycle_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); kenter("{%d}", key->serial); /* We might get a keyring with matching index-key that is nonetheless a * different keyring. */ if (key != ctx->match_data.raw_data) return 0; ctx->result = ERR_PTR(-EDEADLK); return 1; } /* * See if a cycle will be created by inserting acyclic tree B in acyclic * tree A at the topmost level (ie: as a direct child of A). * * Since we are adding B to A at the top level, checking for cycles should just * be a matter of seeing if node A is somewhere in tree B. */ static int keyring_detect_cycle(struct key *A, struct key *B) { struct keyring_search_context ctx = { .index_key = A->index_key, .match_data.raw_data = A, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .iterator = keyring_detect_cycle_iterator, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_NO_UPDATE_TIME | KEYRING_SEARCH_NO_CHECK_PERM | KEYRING_SEARCH_DETECT_TOO_DEEP | KEYRING_SEARCH_RECURSE), }; rcu_read_lock(); search_nested_keyrings(B, &ctx); rcu_read_unlock(); return PTR_ERR(ctx.result) == -EAGAIN ? 0 : PTR_ERR(ctx.result); } /* * Lock keyring for link. */ int __key_link_lock(struct key *keyring, const struct keyring_index_key *index_key) __acquires(&keyring->sem) __acquires(&keyring_serialise_link_lock) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Lock keyrings for move (link/unlink combination). */ int __key_move_lock(struct key *l_keyring, struct key *u_keyring, const struct keyring_index_key *index_key) __acquires(&l_keyring->sem) __acquires(&u_keyring->sem) __acquires(&keyring_serialise_link_lock) { if (l_keyring->type != &key_type_keyring || u_keyring->type != &key_type_keyring) return -ENOTDIR; /* We have to be very careful here to take the keyring locks in the * right order, lest we open ourselves to deadlocking against another * move operation. */ if (l_keyring < u_keyring) { down_write(&l_keyring->sem); down_write_nested(&u_keyring->sem, 1); } else { down_write(&u_keyring->sem); down_write_nested(&l_keyring->sem, 1); } /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Preallocate memory so that a key can be linked into to a keyring. */ int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; int ret; kenter("%d,%s,%s,", keyring->serial, index_key->type->name, index_key->description); BUG_ON(index_key->desc_len == 0); BUG_ON(*_edit != NULL); *_edit = NULL; ret = -EKEYREVOKED; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) goto error; /* Create an edit script that will insert/replace the key in the * keyring tree. */ edit = assoc_array_insert(&keyring->keys, &keyring_assoc_array_ops, index_key, NULL); if (IS_ERR(edit)) { ret = PTR_ERR(edit); goto error; } /* If we're not replacing a link in-place then we're going to need some * extra quota. */ if (!edit->dead_leaf) { ret = key_payload_reserve(keyring, keyring->datalen + KEYQUOTA_LINK_BYTES); if (ret < 0) goto error_cancel; } *_edit = edit; kleave(" = 0"); return 0; error_cancel: assoc_array_cancel_edit(edit); error: kleave(" = %d", ret); return ret; } /* * Check already instantiated keys aren't going to be a problem. * * The caller must have called __key_link_begin(). Don't need to call this for * keys that were created since __key_link_begin() was called. */ int __key_link_check_live_key(struct key *keyring, struct key *key) { if (key->type == &key_type_keyring) /* check that we aren't going to create a cycle by linking one * keyring to another */ return keyring_detect_cycle(keyring, key); return 0; } /* * Link a key into to a keyring. * * Must be called with __key_link_begin() having being called. Discards any * already extant link to matching key if there is one, so that each keyring * holds at most one link to any given key of a particular type+description * combination. */ void __key_link(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { __key_get(key); assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key)); assoc_array_apply_edit(*_edit); *_edit = NULL; notify_key(keyring, NOTIFY_KEY_LINKED, key_serial(key)); } /* * Finish linking a key into to a keyring. * * Must be called with __key_link_begin() having being called. */ void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit *edit) __releases(&keyring->sem) __releases(&keyring_serialise_link_lock) { BUG_ON(index_key->type == NULL); kenter("%d,%s,", keyring->serial, index_key->type->name); if (edit) { if (!edit->dead_leaf) { key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } assoc_array_cancel_edit(edit); } up_write(&keyring->sem); if (index_key->type == &key_type_keyring) mutex_unlock(&keyring_serialise_link_lock); } /* * Check addition of keys to restricted keyrings. */ static int __key_link_check_restriction(struct key *keyring, struct key *key) { if (!keyring->restrict_link || !keyring->restrict_link->check) return 0; return keyring->restrict_link->check(keyring, key->type, &key->payload, keyring->restrict_link->key); } /** * key_link - Link a key to a keyring * @keyring: The keyring to make the link in. * @key: The key to link to. * * Make a link in a keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring. * * This function will write-lock the keyring's semaphore and will consume some * of the user's key data quota to hold the link. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is * full, -EDQUOT if there is insufficient key data quota remaining to add * another link or -ENOMEM if there's insufficient memory. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_link(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; kenter("{%d,%d}", keyring->serial, refcount_read(&keyring->usage)); key_check(keyring); key_check(key); ret = __key_link_lock(keyring, &key->index_key); if (ret < 0) goto error; ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error_end; kdebug("begun {%d,%d}", keyring->serial, refcount_read(&keyring->usage)); ret = __key_link_check_restriction(keyring, key); if (ret == 0) ret = __key_link_check_live_key(keyring, key); if (ret == 0) __key_link(keyring, key, &edit); error_end: __key_link_end(keyring, &key->index_key, edit); error: kleave(" = %d {%d,%d}", ret, keyring->serial, refcount_read(&keyring->usage)); return ret; } EXPORT_SYMBOL(key_link); /* * Lock a keyring for unlink. */ static int __key_unlink_lock(struct key *keyring) __acquires(&keyring->sem) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); return 0; } /* * Begin the process of unlinking a key from a keyring. */ static int __key_unlink_begin(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; BUG_ON(*_edit != NULL); edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops, &key->index_key); if (IS_ERR(edit)) return PTR_ERR(edit); if (!edit) return -ENOENT; *_edit = edit; return 0; } /* * Apply an unlink change. */ static void __key_unlink(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { assoc_array_apply_edit(*_edit); notify_key(keyring, NOTIFY_KEY_UNLINKED, key_serial(key)); *_edit = NULL; key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } /* * Finish unlinking a key from to a keyring. */ static void __key_unlink_end(struct key *keyring, struct key *key, struct assoc_array_edit *edit) __releases(&keyring->sem) { if (edit) assoc_array_cancel_edit(edit); up_write(&keyring->sem); } /** * key_unlink - Unlink the first link to a key from a keyring. * @keyring: The keyring to remove the link from. * @key: The key the link is to. * * Remove a link from a keyring to a key. * * This function will write-lock the keyring's semaphore. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if * the key isn't linked to by the keyring or -ENOMEM if there's insufficient * memory. * * It is assumed that the caller has checked that it is permitted for a link to * be removed (the keyring should have Write permission; no permissions are * required on the key). */ int key_unlink(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; key_check(keyring); key_check(key); ret = __key_unlink_lock(keyring); if (ret < 0) return ret; ret = __key_unlink_begin(keyring, key, &edit); if (ret == 0) __key_unlink(keyring, key, &edit); __key_unlink_end(keyring, key, edit); return ret; } EXPORT_SYMBOL(key_unlink); /** * key_move - Move a key from one keyring to another * @key: The key to move * @from_keyring: The keyring to remove the link from. * @to_keyring: The keyring to make the link in. * @flags: Qualifying flags, such as KEYCTL_MOVE_EXCL. * * Make a link in @to_keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring * whilst simultaneously removing a link to the key from @from_keyring. * * This function will write-lock both keyring's semaphores and will consume * some of the user's key data quota to hold the link on @to_keyring. * * Returns 0 if successful, -ENOTDIR if either keyring isn't a keyring, * -EKEYREVOKED if either keyring has been revoked, -ENFILE if the second * keyring is full, -EDQUOT if there is insufficient key data quota remaining * to add another link or -ENOMEM if there's insufficient memory. If * KEYCTL_MOVE_EXCL is set, then -EEXIST will be returned if there's already a * matching key in @to_keyring. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_move(struct key *key, struct key *from_keyring, struct key *to_keyring, unsigned int flags) { struct assoc_array_edit *from_edit = NULL, *to_edit = NULL; int ret; kenter("%d,%d,%d", key->serial, from_keyring->serial, to_keyring->serial); if (from_keyring == to_keyring) return 0; key_check(key); key_check(from_keyring); key_check(to_keyring); ret = __key_move_lock(from_keyring, to_keyring, &key->index_key); if (ret < 0) goto out; ret = __key_unlink_begin(from_keyring, key, &from_edit); if (ret < 0) goto error; ret = __key_link_begin(to_keyring, &key->index_key, &to_edit); if (ret < 0) goto error; ret = -EEXIST; if (to_edit->dead_leaf && (flags & KEYCTL_MOVE_EXCL)) goto error; ret = __key_link_check_restriction(to_keyring, key); if (ret < 0) goto error; ret = __key_link_check_live_key(to_keyring, key); if (ret < 0) goto error; __key_unlink(from_keyring, key, &from_edit); __key_link(to_keyring, key, &to_edit); error: __key_link_end(to_keyring, &key->index_key, to_edit); __key_unlink_end(from_keyring, key, from_edit); out: kleave(" = %d", ret); return ret; } EXPORT_SYMBOL(key_move); /** * keyring_clear - Clear a keyring * @keyring: The keyring to clear. * * Clear the contents of the specified keyring. * * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring. */ int keyring_clear(struct key *keyring) { struct assoc_array_edit *edit; int ret; if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (IS_ERR(edit)) { ret = PTR_ERR(edit); } else { if (edit) assoc_array_apply_edit(edit); notify_key(keyring, NOTIFY_KEY_CLEARED, 0); key_payload_reserve(keyring, 0); ret = 0; } up_write(&keyring->sem); return ret; } EXPORT_SYMBOL(keyring_clear); /* * Dispose of the links from a revoked keyring. * * This is called with the key sem write-locked. */ static void keyring_revoke(struct key *keyring) { struct assoc_array_edit *edit; edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (!IS_ERR(edit)) { if (edit) assoc_array_apply_edit(edit); key_payload_reserve(keyring, 0); } } static bool keyring_gc_select_iterator(void *object, void *iterator_data) { struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; if (key_is_dead(key, *limit)) return false; key_get(key); return true; } static int keyring_gc_check_iterator(const void *object, void *iterator_data) { const struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; key_check(key); return key_is_dead(key, *limit); } /* * Garbage collect pointers from a keyring. * * Not called with any locks held. The keyring's key struct will not be * deallocated under us as only our caller may deallocate it. */ void keyring_gc(struct key *keyring, time64_t limit) { int result; kenter("%x{%s}", keyring->serial, keyring->description ?: ""); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto dont_gc; /* scan the keyring looking for dead keys */ rcu_read_lock(); result = assoc_array_iterate(&keyring->keys, keyring_gc_check_iterator, &limit); rcu_read_unlock(); if (result == true) goto do_gc; dont_gc: kleave(" [no gc]"); return; do_gc: down_write(&keyring->sem); assoc_array_gc(&keyring->keys, &keyring_assoc_array_ops, keyring_gc_select_iterator, &limit); up_write(&keyring->sem); kleave(" [gc]"); } /* * Garbage collect restriction pointers from a keyring. * * Keyring restrictions are associated with a key type, and must be cleaned * up if the key type is unregistered. The restriction is altered to always * reject additional keys so a keyring cannot be opened up by unregistering * a key type. * * Not called with any keyring locks held. The keyring's key struct will not * be deallocated under us as only our caller may deallocate it. * * The caller is required to hold key_types_sem and dead_type->sem. This is * fulfilled by key_gc_keytype() holding the locks on behalf of * key_garbage_collector(), which it invokes on a workqueue. */ void keyring_restriction_gc(struct key *keyring, struct key_type *dead_type) { struct key_restriction *keyres; kenter("%x{%s}", keyring->serial, keyring->description ?: ""); /* * keyring->restrict_link is only assigned at key allocation time * or with the key type locked, so the only values that could be * concurrently assigned to keyring->restrict_link are for key * types other than dead_type. Given this, it's ok to check * the key type before acquiring keyring->sem. */ if (!dead_type || !keyring->restrict_link || keyring->restrict_link->keytype != dead_type) { kleave(" [no restriction gc]"); return; } /* Lock the keyring to ensure that a link is not in progress */ down_write(&keyring->sem); keyres = keyring->restrict_link; keyres->check = restrict_link_reject; key_put(keyres->key); keyres->key = NULL; keyres->keytype = NULL; up_write(&keyring->sem); kleave(" [restriction gc]"); }
42 43 43 1 1 1 1 1 1 4 4 1 3 4 4 4 4 86 86 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2021-2024 Intel Corporation */ #include <linux/export.h> #include <linux/etherdevice.h> #include <net/mac80211.h> #include <linux/unaligned.h> #include "ieee80211_i.h" #include "rate.h" #include "mesh.h" #include "led.h" #include "wme.h" void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int tmp; skb->pkt_type = IEEE80211_TX_STATUS_MSG; skb_queue_tail(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS ? &local->skb_queue : &local->skb_queue_unreliable, skb); tmp = skb_queue_len(&local->skb_queue) + skb_queue_len(&local->skb_queue_unreliable); while (tmp > IEEE80211_IRQSAFE_QUEUE_LIMIT && (skb = skb_dequeue(&local->skb_queue_unreliable))) { ieee80211_free_txskb(hw, skb); tmp--; I802_DEBUG_INC(local->tx_status_drop); } tasklet_schedule(&local->tasklet); } EXPORT_SYMBOL(ieee80211_tx_status_irqsafe); static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; int ac; if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER | IEEE80211_TX_CTL_AMPDU | IEEE80211_TX_CTL_HW_80211_ENCAP)) { ieee80211_free_txskb(&local->hw, skb); return; } /* * This skb 'survived' a round-trip through the driver, and * hopefully the driver didn't mangle it too badly. However, * we can definitely not rely on the control information * being correct. Clear it so we don't get junk there, and * indicate that it needs new processing, but must not be * modified/encrypted again. */ memset(&info->control, 0, sizeof(info->control)); info->control.jiffies = jiffies; info->control.vif = &sta->sdata->vif; info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->flags |= IEEE80211_TX_INTFL_RETRANSMISSION; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; sta->deflink.status_stats.filtered++; /* * Clear more-data bit on filtered frames, it might be set * but later frames might time out so it might have to be * clear again ... It's all rather unlikely (this frame * should time out first, right?) but let's not confuse * peers unnecessarily. */ if (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_MOREDATA)) hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_MOREDATA); if (ieee80211_is_data_qos(hdr->frame_control)) { u8 *p = ieee80211_get_qos_ctl(hdr); int tid = *p & IEEE80211_QOS_CTL_TID_MASK; /* * Clear EOSP if set, this could happen e.g. * if an absence period (us being a P2P GO) * shortens the SP. */ if (*p & IEEE80211_QOS_CTL_EOSP) *p &= ~IEEE80211_QOS_CTL_EOSP; ac = ieee80211_ac_from_tid(tid); } else { ac = IEEE80211_AC_BE; } /* * Clear the TX filter mask for this STA when sending the next * packet. If the STA went to power save mode, this will happen * when it wakes up for the next time. */ set_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT); ieee80211_clear_fast_xmit(sta); /* * This code races in the following way: * * (1) STA sends frame indicating it will go to sleep and does so * (2) hardware/firmware adds STA to filter list, passes frame up * (3) hardware/firmware processes TX fifo and suppresses a frame * (4) we get TX status before having processed the frame and * knowing that the STA has gone to sleep. * * This is actually quite unlikely even when both those events are * processed from interrupts coming in quickly after one another or * even at the same time because we queue both TX status events and * RX frames to be processed by a tasklet and process them in the * same order that they were received or TX status last. Hence, there * is no race as long as the frame RX is processed before the next TX * status, which drivers can ensure, see below. * * Note that this can only happen if the hardware or firmware can * actually add STAs to the filter list, if this is done by the * driver in response to set_tim() (which will only reduce the race * this whole filtering tries to solve, not completely solve it) * this situation cannot happen. * * To completely solve this race drivers need to make sure that they * (a) don't mix the irq-safe/not irq-safe TX status/RX processing * functions and * (b) always process RX events before TX status events if ordering * can be unknown, for example with different interrupt status * bits. * (c) if PS mode transitions are manual (i.e. the flag * %IEEE80211_HW_AP_LINK_PS is set), always process PS state * changes before calling TX status events if ordering can be * unknown. */ if (test_sta_flag(sta, WLAN_STA_PS_STA) && skb_queue_len(&sta->tx_filtered[ac]) < STA_MAX_TX_BUFFER) { skb_queue_tail(&sta->tx_filtered[ac], skb); sta_info_recalc_tim(sta); if (!timer_pending(&local->sta_cleanup)) mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); return; } if (!test_sta_flag(sta, WLAN_STA_PS_STA) && !(info->flags & IEEE80211_TX_INTFL_RETRIED)) { /* Software retry the packet once */ info->flags |= IEEE80211_TX_INTFL_RETRIED; ieee80211_add_pending_skb(local, skb); return; } ps_dbg_ratelimited(sta->sdata, "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n", skb_queue_len(&sta->tx_filtered[ac]), !!test_sta_flag(sta, WLAN_STA_PS_STA), jiffies); ieee80211_free_txskb(&local->hw, skb); } static void ieee80211_check_pending_bar(struct sta_info *sta, u8 *addr, u8 tid) { struct tid_ampdu_tx *tid_tx; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx || !tid_tx->bar_pending) return; tid_tx->bar_pending = false; ieee80211_send_bar(&sta->sdata->vif, addr, tid, tid_tx->failed_bar_ssn); } static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *) skb->data; if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *) skb->data; u8 *qc = ieee80211_get_qos_ctl(hdr); u16 tid = qc[0] & 0xf; ieee80211_check_pending_bar(sta, hdr->addr1, tid); } } static void ieee80211_set_bar_pending(struct sta_info *sta, u8 tid, u16 ssn) { struct tid_ampdu_tx *tid_tx; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx) return; tid_tx->failed_bar_ssn = ssn; tid_tx->bar_pending = true; } static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info, struct ieee80211_tx_status *status) { struct ieee80211_rate_status *status_rate = NULL; int len = sizeof(struct ieee80211_radiotap_header); if (status && status->n_rates) status_rate = &status->rates[status->n_rates - 1]; /* IEEE80211_RADIOTAP_RATE rate */ if (status_rate && !(status_rate->rate_idx.flags & (RATE_INFO_FLAGS_MCS | RATE_INFO_FLAGS_DMG | RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) len += 2; else if (info->status.rates[0].idx >= 0 && !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) len += 2; /* IEEE80211_RADIOTAP_TX_FLAGS */ len += 2; /* IEEE80211_RADIOTAP_DATA_RETRIES */ len += 1; /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (status_rate) { if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_MCS) len += 3; else if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_VHT_MCS) len = ALIGN(len, 2) + 12; else if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_HE_MCS) len = ALIGN(len, 2) + 12; } else if (info->status.rates[0].idx >= 0) { if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) len += 3; else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) len = ALIGN(len, 2) + 12; } return len; } static void ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, struct sk_buff *skb, int retry_count, int rtap_len, struct ieee80211_tx_status *status) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_radiotap_header *rthdr; struct ieee80211_rate_status *status_rate = NULL; unsigned char *pos; u16 legacy_rate = 0; u16 txflags; if (status && status->n_rates) status_rate = &status->rates[status->n_rates - 1]; rthdr = skb_push(skb, rtap_len); memset(rthdr, 0, rtap_len); rthdr->it_len = cpu_to_le16(rtap_len); rthdr->it_present = cpu_to_le32(BIT(IEEE80211_RADIOTAP_TX_FLAGS) | BIT(IEEE80211_RADIOTAP_DATA_RETRIES)); pos = (unsigned char *)(rthdr + 1); /* * XXX: Once radiotap gets the bitmap reset thing the vendor * extensions proposal contains, we can actually report * the whole set of tries we did. */ /* IEEE80211_RADIOTAP_RATE */ if (status_rate) { if (!(status_rate->rate_idx.flags & (RATE_INFO_FLAGS_MCS | RATE_INFO_FLAGS_DMG | RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) legacy_rate = status_rate->rate_idx.legacy; } else if (info->status.rates[0].idx >= 0 && !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[info->band]; legacy_rate = sband->bitrates[info->status.rates[0].idx].bitrate; } if (legacy_rate) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_RATE)); *pos = DIV_ROUND_UP(legacy_rate, 5); /* padding for tx flags */ pos += 2; } /* IEEE80211_RADIOTAP_TX_FLAGS */ txflags = 0; if (!(info->flags & IEEE80211_TX_STAT_ACK) && !is_multicast_ether_addr(hdr->addr1)) txflags |= IEEE80211_RADIOTAP_F_TX_FAIL; if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT) txflags |= IEEE80211_RADIOTAP_F_TX_CTS; if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) txflags |= IEEE80211_RADIOTAP_F_TX_RTS; put_unaligned_le16(txflags, pos); pos += 2; /* IEEE80211_RADIOTAP_DATA_RETRIES */ /* for now report the total retry_count */ *pos = retry_count; pos++; if (status_rate && (status_rate->rate_idx.flags & RATE_INFO_FLAGS_MCS)) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_SHORT_GI) pos[1] |= IEEE80211_RADIOTAP_MCS_SGI; if (status_rate->rate_idx.bw == RATE_INFO_BW_40) pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40; pos[2] = status_rate->rate_idx.mcs; pos += 3; } else if (status_rate && (status_rate->rate_idx.flags & RATE_INFO_FLAGS_VHT_MCS)) { u16 known = local->hw.radiotap_vht_details & (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */ put_unaligned_le16(known, pos); pos += 2; /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */ if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; pos++; /* u8 bandwidth */ switch (status_rate->rate_idx.bw) { case RATE_INFO_BW_160: *pos = 11; break; case RATE_INFO_BW_80: *pos = 4; break; case RATE_INFO_BW_40: *pos = 1; break; default: *pos = 0; break; } pos++; /* u8 mcs_nss[4] */ *pos = (status_rate->rate_idx.mcs << 4) | status_rate->rate_idx.nss; pos += 4; /* u8 coding */ pos++; /* u8 group_id */ pos++; /* u16 partial_aid */ pos += 2; } else if (status_rate && (status_rate->rate_idx.flags & RATE_INFO_FLAGS_HE_MCS)) { struct ieee80211_radiotap_he *he; rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); he = (struct ieee80211_radiotap_he *)pos; he->data1 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA1_FORMAT_SU | IEEE80211_RADIOTAP_HE_DATA1_DATA_MCS_KNOWN | IEEE80211_RADIOTAP_HE_DATA1_DATA_DCM_KNOWN | IEEE80211_RADIOTAP_HE_DATA1_BW_RU_ALLOC_KNOWN); he->data2 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA2_GI_KNOWN); #define HE_PREP(f, val) le16_encode_bits(val, IEEE80211_RADIOTAP_HE_##f) he->data6 |= HE_PREP(DATA6_NSTS, status_rate->rate_idx.nss); #define CHECK_GI(s) \ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \ (int)NL80211_RATE_INFO_HE_GI_##s) CHECK_GI(0_8); CHECK_GI(1_6); CHECK_GI(3_2); he->data3 |= HE_PREP(DATA3_DATA_MCS, status_rate->rate_idx.mcs); he->data3 |= HE_PREP(DATA3_DATA_DCM, status_rate->rate_idx.he_dcm); he->data5 |= HE_PREP(DATA5_GI, status_rate->rate_idx.he_gi); switch (status_rate->rate_idx.bw) { case RATE_INFO_BW_20: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ); break; case RATE_INFO_BW_40: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ); break; case RATE_INFO_BW_80: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ); break; case RATE_INFO_BW_160: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ); break; case RATE_INFO_BW_HE_RU: #define CHECK_RU_ALLOC(s) \ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \ NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4) CHECK_RU_ALLOC(26); CHECK_RU_ALLOC(52); CHECK_RU_ALLOC(106); CHECK_RU_ALLOC(242); CHECK_RU_ALLOC(484); CHECK_RU_ALLOC(996); CHECK_RU_ALLOC(2x996); he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, status_rate->rate_idx.he_ru_alloc + 4); break; default: WARN_ONCE(1, "Invalid SU BW %d\n", status_rate->rate_idx.bw); } pos += sizeof(struct ieee80211_radiotap_he); } if (status_rate || info->status.rates[0].idx < 0) return; /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) pos[1] |= IEEE80211_RADIOTAP_MCS_SGI; if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40; if (info->status.rates[0].flags & IEEE80211_TX_RC_GREEN_FIELD) pos[1] |= IEEE80211_RADIOTAP_MCS_FMT_GF; pos[2] = info->status.rates[0].idx; pos += 3; } else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) { u16 known = local->hw.radiotap_vht_details & (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */ put_unaligned_le16(known, pos); pos += 2; /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */ if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; pos++; /* u8 bandwidth */ if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) *pos = 1; else if (info->status.rates[0].flags & IEEE80211_TX_RC_80_MHZ_WIDTH) *pos = 4; else if (info->status.rates[0].flags & IEEE80211_TX_RC_160_MHZ_WIDTH) *pos = 11; else /* IEEE80211_TX_RC_{20_MHZ_WIDTH,FIXME:DUP_DATA} */ *pos = 0; pos++; /* u8 mcs_nss[4] */ *pos = (ieee80211_rate_get_vht_mcs(&info->status.rates[0]) << 4) | ieee80211_rate_get_vht_nss(&info->status.rates[0]); pos += 4; /* u8 coding */ pos++; /* u8 group_id */ pos++; /* u16 partial_aid */ pos += 2; } } /* * Handles the tx for TDLS teardown frames. * If the frame wasn't ACKed by the peer - it will be re-sent through the AP */ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 flags) { struct sk_buff *teardown_skb; struct sk_buff *orig_teardown_skb; bool is_teardown = false; /* Get the teardown data we need and free the lock */ spin_lock(&sdata->u.mgd.teardown_lock); teardown_skb = sdata->u.mgd.teardown_skb; orig_teardown_skb = sdata->u.mgd.orig_teardown_skb; if ((skb == orig_teardown_skb) && teardown_skb) { sdata->u.mgd.teardown_skb = NULL; sdata->u.mgd.orig_teardown_skb = NULL; is_teardown = true; } spin_unlock(&sdata->u.mgd.teardown_lock); if (is_teardown) { /* This mechanism relies on being able to get ACKs */ WARN_ON(!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)); /* Check if peer has ACKed */ if (flags & IEEE80211_TX_STAT_ACK) { dev_kfree_skb_any(teardown_skb); } else { tdls_dbg(sdata, "TDLS Resending teardown through AP\n"); ieee80211_subif_start_xmit(teardown_skb, skb->dev); } } } static struct ieee80211_sub_if_data * ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata; if (skb->dev) { list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; if (skb->dev == sdata->dev) return sdata; } return NULL; } return rcu_dereference(local->p2p_sdata); } static void ieee80211_report_ack_skb(struct ieee80211_local *local, struct sk_buff *orig_skb, bool acked, bool dropped, ktime_t ack_hwtstamp) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(orig_skb); struct sk_buff *skb; unsigned long flags; spin_lock_irqsave(&local->ack_status_lock, flags); skb = idr_remove(&local->ack_status_frames, info->status_data); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (!skb) return; if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; struct ieee80211_sub_if_data *sdata; struct ieee80211_hdr *hdr = (void *)skb->data; bool is_valid_ack_signal = !!(info->status.flags & IEEE80211_TX_STATUS_ACK_SIGNAL_VALID); struct cfg80211_tx_status status = { .cookie = cookie, .buf = skb->data, .len = skb->len, .ack = acked, }; if (ieee80211_is_timing_measurement(orig_skb) || ieee80211_is_ftm(orig_skb)) { status.tx_tstamp = ktime_to_ns(skb_hwtstamps(orig_skb)->hwtstamp); status.ack_tstamp = ktime_to_ns(ack_hwtstamp); } rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (sdata) { if (skb->protocol == sdata->control_port_protocol || skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) cfg80211_control_port_tx_status(&sdata->wdev, cookie, skb->data, skb->len, acked, GFP_ATOMIC); else if (ieee80211_is_any_nullfunc(hdr->frame_control)) cfg80211_probe_status(sdata->dev, hdr->addr1, cookie, acked, info->status.ack_signal, is_valid_ack_signal, GFP_ATOMIC); else if (ieee80211_is_mgmt(hdr->frame_control)) cfg80211_mgmt_tx_status_ext(&sdata->wdev, &status, GFP_ATOMIC); else pr_warn("Unknown status report in ack skb\n"); } rcu_read_unlock(); dev_kfree_skb_any(skb); } else if (dropped) { dev_kfree_skb_any(skb); } else { /* consumes skb */ skb_complete_wifi_ack(skb, acked); } } static void ieee80211_handle_smps_status(struct ieee80211_sub_if_data *sdata, bool acked, u16 status_data) { u16 sub_data = u16_get_bits(status_data, IEEE80211_STATUS_SUBDATA_MASK); enum ieee80211_smps_mode smps_mode = sub_data & 3; int link_id = (sub_data >> 2); struct ieee80211_link_data *link; if (!sdata || !ieee80211_sdata_running(sdata)) return; if (!acked) return; if (sdata->vif.type != NL80211_IFTYPE_STATION) return; if (WARN(link_id >= ARRAY_SIZE(sdata->link), "bad SMPS status link: %d\n", link_id)) return; link = rcu_dereference(sdata->link[link_id]); if (!link) return; /* * This update looks racy, but isn't, the only other place * updating this variable is in managed mode before assoc, * and we have to be associated to have a status from the * action frame TX, since we cannot send it while we're not * associated yet. */ link->smps_mode = smps_mode; wiphy_work_queue(sdata->local->hw.wiphy, &link->u.mgd.recalc_smps); } static void ieee80211_handle_teardown_ttlm_status(struct ieee80211_sub_if_data *sdata, bool acked) { if (!sdata || !ieee80211_sdata_running(sdata)) return; if (!acked) return; if (sdata->vif.type != NL80211_IFTYPE_STATION) return; wiphy_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.teardown_ttlm_work); } static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped, ktime_t ack_hwtstamp) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 tx_time_est = ieee80211_info_get_tx_time_est(info); struct ieee80211_hdr *hdr = (void *)skb->data; bool acked = info->flags & IEEE80211_TX_STAT_ACK; if (dropped) acked = false; if (tx_time_est) { struct sta_info *sta; rcu_read_lock(); sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); ieee80211_sta_update_pending_airtime(local, sta, skb_get_queue_mapping(skb), tx_time_est, true); rcu_read_unlock(); } if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { struct ieee80211_sub_if_data *sdata; rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (!sdata) { skb->dev = NULL; } else if (!dropped) { /* Check to see if packet is a TDLS teardown packet */ if (ieee80211_is_data(hdr->frame_control) && (ieee80211_get_tdls_action(skb) == WLAN_TDLS_TEARDOWN)) { ieee80211_tdls_td_tx_handle(local, sdata, skb, info->flags); } else if (ieee80211_s1g_is_twt_setup(skb)) { if (!acked) { struct sk_buff *qskb; qskb = skb_clone(skb, GFP_ATOMIC); if (qskb) { skb_queue_tail(&sdata->status_queue, qskb); wiphy_work_queue(local->hw.wiphy, &sdata->work); } } } else { ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control, acked); } } rcu_read_unlock(); } else if (info->status_data_idr) { ieee80211_report_ack_skb(local, skb, acked, dropped, ack_hwtstamp); } else if (info->status_data) { struct ieee80211_sub_if_data *sdata; rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); switch (u16_get_bits(info->status_data, IEEE80211_STATUS_TYPE_MASK)) { case IEEE80211_STATUS_TYPE_SMPS: ieee80211_handle_smps_status(sdata, acked, info->status_data); break; case IEEE80211_STATUS_TYPE_NEG_TTLM: ieee80211_handle_teardown_ttlm_status(sdata, acked); break; } rcu_read_unlock(); } if (!dropped && skb->destructor) { skb->wifi_acked_valid = 1; skb->wifi_acked = acked; } ieee80211_led_tx(local); if (skb_has_frag_list(skb)) { kfree_skb_list(skb_shinfo(skb)->frag_list); skb_shinfo(skb)->frag_list = NULL; } } /* * Use a static threshold for now, best value to be determined * by testing ... * Should it depend on: * - on # of retransmissions * - current throughput (higher value for higher tpt)? */ #define STA_LOST_PKT_THRESHOLD 50 #define STA_LOST_PKT_TIME HZ /* 1 sec since last ACK */ #define STA_LOST_TDLS_PKT_TIME (10*HZ) /* 10secs since last ACK */ static void ieee80211_lost_packet(struct sta_info *sta, struct ieee80211_tx_info *info) { unsigned long pkt_time = STA_LOST_PKT_TIME; unsigned int pkt_thr = STA_LOST_PKT_THRESHOLD; /* If driver relies on its own algorithm for station kickout, skip * mac80211 packet loss mechanism. */ if (ieee80211_hw_check(&sta->local->hw, REPORTS_LOW_ACK)) return; /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; sta->deflink.status_stats.lost_packets++; if (sta->sta.tdls) { pkt_time = STA_LOST_TDLS_PKT_TIME; pkt_thr = STA_LOST_PKT_THRESHOLD; } /* * If we're in TDLS mode, make sure that all STA_LOST_PKT_THRESHOLD * of the last packets were lost, and that no ACK was received in the * last STA_LOST_TDLS_PKT_TIME ms, before triggering the CQM packet-loss * mechanism. * For non-TDLS, use STA_LOST_PKT_THRESHOLD and STA_LOST_PKT_TIME */ if (sta->deflink.status_stats.lost_packets < pkt_thr || !time_after(jiffies, sta->deflink.status_stats.last_pkt_time + pkt_time)) return; cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, sta->deflink.status_stats.lost_packets, GFP_ATOMIC); sta->deflink.status_stats.lost_packets = 0; } static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int *retry_count) { int count = -1; int i; for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) { /* just the first aggr frame carry status info */ info->status.rates[i].idx = -1; info->status.rates[i].count = 0; break; } else if (info->status.rates[i].idx < 0) { break; } else if (i >= hw->max_report_rates) { /* the HW cannot have attempted that rate */ info->status.rates[i].idx = -1; info->status.rates[i].count = 0; break; } count += info->status.rates[i].count; } if (count < 0) count = 0; *retry_count = count; return i - 1; } void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, int retry_count, struct ieee80211_tx_status *status) { struct sk_buff *skb2; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_sub_if_data *sdata; struct net_device *prev_dev = NULL; int rtap_len; /* send frame to monitor interfaces now */ rtap_len = ieee80211_tx_radiotap_len(info, status); if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) { pr_err("ieee80211_tx_status: headroom too small\n"); dev_kfree_skb(skb); return; } ieee80211_add_tx_radiotap_header(local, skb, retry_count, rtap_len, status); /* XXX: is this sufficient for BPF? */ skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); memset(skb->cb, 0, sizeof(skb->cb)); rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->u.mntr.flags & MONITOR_FLAG_SKIP_TX) continue; if (prev_dev) { skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) { skb2->dev = prev_dev; netif_rx(skb2); } } prev_dev = sdata->dev; } } if (prev_dev) { skb->dev = prev_dev; netif_rx(skb); skb = NULL; } rcu_read_unlock(); dev_kfree_skb(skb); } static void __ieee80211_tx_status(struct ieee80211_hw *hw, struct ieee80211_tx_status *status, int rates_idx, int retry_count) { struct sk_buff *skb = status->skb; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = status->info; struct sta_info *sta; __le16 fc; bool acked; bool noack_success; struct ieee80211_bar *bar; int tid = IEEE80211_NUM_TIDS; fc = hdr->frame_control; if (status->sta) { sta = container_of(status->sta, struct sta_info, sta); if (info->flags & IEEE80211_TX_STATUS_EOSP) clear_sta_flag(sta, WLAN_STA_SP); acked = !!(info->flags & IEEE80211_TX_STAT_ACK); noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); /* mesh Peer Service Period support */ if (ieee80211_vif_is_mesh(&sta->sdata->vif) && ieee80211_is_data_qos(fc)) ieee80211_mpsp_trigger_process( ieee80211_get_qos_ctl(hdr), sta, true, acked); if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) && (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) sta->deflink.tx_stats.last_rate = info->status.rates[rates_idx]; if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) && (ieee80211_is_data_qos(fc))) { u16 ssn; u8 *qc; qc = ieee80211_get_qos_ctl(hdr); tid = qc[0] & 0xf; ssn = ((le16_to_cpu(hdr->seq_ctrl) + 0x10) & IEEE80211_SCTL_SEQ); ieee80211_send_bar(&sta->sdata->vif, hdr->addr1, tid, ssn); } else if (ieee80211_is_data_qos(fc)) { u8 *qc = ieee80211_get_qos_ctl(hdr); tid = qc[0] & 0xf; } if (!acked && ieee80211_is_back_req(fc)) { u16 control; /* * BAR failed, store the last SSN and retry sending * the BAR when the next unicast transmission on the * same TID succeeds. */ bar = (struct ieee80211_bar *) skb->data; control = le16_to_cpu(bar->control); if (!(control & IEEE80211_BAR_CTRL_MULTI_TID)) { u16 ssn = le16_to_cpu(bar->start_seq_num); tid = (control & IEEE80211_BAR_CTRL_TID_INFO_MASK) >> IEEE80211_BAR_CTRL_TID_INFO_SHIFT; ieee80211_set_bar_pending(sta, tid, ssn); } } if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) { ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (ieee80211_is_data_present(fc)) { if (!acked && !noack_success) sta->deflink.status_stats.msdu_failed[tid]++; sta->deflink.status_stats.msdu_retries[tid] += retry_count; } if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) ieee80211_frame_acked(sta, skb); } /* SNMP counters * Fragments are passed to low-level drivers as separate skbs, so these * are actually fragments, not frames. Update frame counters only for * the first fragment of the frame. */ if ((info->flags & IEEE80211_TX_STAT_ACK) || (info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED)) { if (ieee80211_is_first_frag(hdr->seq_ctrl)) { I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (is_multicast_ether_addr(ieee80211_get_DA(hdr))) I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) I802_DEBUG_INC(local->dot11MultipleRetryCount); } /* This counter shall be incremented for an acknowledged MPDU * with an individual address in the address 1 field or an MPDU * with a multicast address in the address 1 field of type Data * or Management. */ if (!is_multicast_ether_addr(hdr->addr1) || ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) I802_DEBUG_INC(local->dot11TransmittedFragmentCount); } else { if (ieee80211_is_first_frag(hdr->seq_ctrl)) I802_DEBUG_INC(local->dot11FailedCount); } if (ieee80211_is_any_nullfunc(fc) && ieee80211_has_pm(fc) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { if (info->flags & IEEE80211_TX_STAT_ACK) local->ps_sdata->u.mgd.flags |= IEEE80211_STA_NULLFUNC_ACKED; mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(10)); } ieee80211_report_used_skb(local, skb, false, status->ack_hwtstamp); /* * This is a bit racy but we can avoid a lot of work * with this test... */ if (local->tx_mntrs) ieee80211_tx_monitor(local, skb, retry_count, status); else if (status->free_list) list_add_tail(&skb->list, status->free_list); else dev_kfree_skb(skb); } void ieee80211_tx_status_skb(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_status status = { .skb = skb, .info = IEEE80211_SKB_CB(skb), }; struct sta_info *sta; rcu_read_lock(); sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); if (sta) status.sta = &sta->sta; ieee80211_tx_status_ext(hw, &status); rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_tx_status_skb); void ieee80211_tx_status_ext(struct ieee80211_hw *hw, struct ieee80211_tx_status *status) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = status->info; struct ieee80211_sta *pubsta = status->sta; struct sk_buff *skb = status->skb; struct sta_info *sta = NULL; int rates_idx, retry_count; bool acked, noack_success, ack_signal_valid; u16 tx_time_est; if (pubsta) { sta = container_of(pubsta, struct sta_info, sta); if (status->n_rates) sta->deflink.tx_stats.last_rate_info = status->rates[status->n_rates - 1].rate_idx; } if (skb && (tx_time_est = ieee80211_info_get_tx_time_est(IEEE80211_SKB_CB(skb))) > 0) { /* Do this here to avoid the expensive lookup of the sta * in ieee80211_report_used_skb(). */ ieee80211_sta_update_pending_airtime(local, sta, skb_get_queue_mapping(skb), tx_time_est, true); ieee80211_info_set_tx_time_est(IEEE80211_SKB_CB(skb), 0); } if (!status->info) goto free; rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); acked = !!(info->flags & IEEE80211_TX_STAT_ACK); noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); ack_signal_valid = !!(info->status.flags & IEEE80211_TX_STATUS_ACK_SIGNAL_VALID); if (pubsta) { struct ieee80211_sub_if_data *sdata = sta->sdata; if (!acked && !noack_success) sta->deflink.status_stats.retry_failed++; sta->deflink.status_stats.retry_count += retry_count; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { if (sdata->vif.type == NL80211_IFTYPE_STATION && skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)) ieee80211_sta_tx_notify(sdata, (void *) skb->data, acked, info->status.tx_time); if (acked) { sta->deflink.status_stats.last_ack = jiffies; if (sta->deflink.status_stats.lost_packets) sta->deflink.status_stats.lost_packets = 0; /* Track when last packet was ACKed */ sta->deflink.status_stats.last_pkt_time = jiffies; /* Reset connection monitor */ if (sdata->vif.type == NL80211_IFTYPE_STATION && unlikely(sdata->u.mgd.probe_send_count > 0)) sdata->u.mgd.probe_send_count = 0; if (ack_signal_valid) { sta->deflink.status_stats.last_ack_signal = (s8)info->status.ack_signal; sta->deflink.status_stats.ack_signal_filled = true; ewma_avg_signal_add(&sta->deflink.status_stats.avg_ack_signal, -info->status.ack_signal); } } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { /* * The STA is in power save mode, so assume * that this TX packet failed because of that. */ if (skb) ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (noack_success) { /* nothing to do here, do not account as lost */ } else { ieee80211_lost_packet(sta, info); } } rate_control_tx_status(local, status); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) ieee80211s_update_metric(local, sta, status); } if (skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)) return __ieee80211_tx_status(hw, status, rates_idx, retry_count); if (acked || noack_success) { I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (!pubsta) I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) I802_DEBUG_INC(local->dot11MultipleRetryCount); } else { I802_DEBUG_INC(local->dot11FailedCount); } free: if (!skb) return; ieee80211_report_used_skb(local, skb, false, status->ack_hwtstamp); if (status->free_list) list_add_tail(&skb->list, status->free_list); else dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_tx_status_ext); void ieee80211_tx_rate_update(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_tx_info *info) { struct ieee80211_local *local = hw_to_local(hw); struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_tx_status status = { .info = info, .sta = pubsta, }; rate_control_tx_status(local, &status); if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) sta->deflink.tx_stats.last_rate = info->status.rates[0]; } EXPORT_SYMBOL(ieee80211_tx_rate_update); void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, num_packets, GFP_ATOMIC); } EXPORT_SYMBOL(ieee80211_report_low_ack); void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_local *local = hw_to_local(hw); ktime_t kt = ktime_set(0, 0); ieee80211_report_used_skb(local, skb, true, kt); dev_kfree_skb_any(skb); } EXPORT_SYMBOL(ieee80211_free_txskb); void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff_head *skbs) { struct sk_buff *skb; while ((skb = __skb_dequeue(skbs))) ieee80211_free_txskb(hw, skb); } EXPORT_SYMBOL(ieee80211_purge_tx_queue);
7 4 1 2 49 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2019 Facebook */ #include <linux/hash.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/rbtree_latch.h> #include <linux/perf_event.h> #include <linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/rcupdate_wait.h> #include <linux/static_call.h> #include <linux/bpf_verifier.h> #include <linux/bpf_lsm.h> #include <linux/delay.h> /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { }; const struct bpf_prog_ops bpf_extension_prog_ops = { }; /* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */ #define TRAMPOLINE_HASH_BITS 10 #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) { struct bpf_trampoline *tr = ops->private; int ret = 0; if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { /* This is called inside register_ftrace_direct_multi(), so * tr->mutex is already locked. */ lockdep_assert_held_once(&tr->mutex); /* Instead of updating the trampoline here, we propagate * -EAGAIN to register_ftrace_direct(). Then we can * retry register_ftrace_direct() after updating the * trampoline. */ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) { if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY)) return -EBUSY; tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; return -EAGAIN; } return 0; } /* The normal locking order is * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) * * The following two commands are called from * * prepare_direct_functions_for_ipmodify * cleanup_direct_functions_after_ipmodify * * In both cases, direct_mutex is already locked. Use * mutex_trylock(&tr->mutex) to avoid deadlock in race condition * (something else is making changes to this same trampoline). */ if (!mutex_trylock(&tr->mutex)) { /* sleep 1 ms to make sure whatever holding tr->mutex makes * some progress. */ msleep(1); return -EAGAIN; } switch (cmd) { case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER: tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); break; case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER: tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY; if (tr->flags & BPF_TRAMP_F_ORIG_STACK) ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); break; default: ret = -EINVAL; break; } mutex_unlock(&tr->mutex); return ret; } #endif bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { enum bpf_attach_type eatype = prog->expected_attach_type; enum bpf_prog_type ptype = prog->type; return (ptype == BPF_PROG_TYPE_TRACING && (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || eatype == BPF_MODIFY_RETURN)) || (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; ksym->end = ksym->start + size; } void bpf_image_ksym_add(struct bpf_ksym *ksym) { bpf_ksym_add(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, PAGE_SIZE, false, ksym->name); } void bpf_image_ksym_del(struct bpf_ksym *ksym) { bpf_ksym_del(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, PAGE_SIZE, true, ksym->name); } static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; struct hlist_head *head; int i; mutex_lock(&trampoline_mutex); head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; hlist_for_each_entry(tr, head, hlist) { if (tr->key == key) { refcount_inc(&tr->refcnt); goto out; } } tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) goto out; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); if (!tr->fops) { kfree(tr); tr = NULL; goto out; } tr->fops->private = tr; tr->fops->ops_func = bpf_tramp_ftrace_ops_func; #endif tr->key = key; INIT_HLIST_NODE(&tr->hlist); hlist_add_head(&tr->hlist, head); refcount_set(&tr->refcnt, 1); mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); out: mutex_unlock(&trampoline_mutex); return tr; } static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) { void *ip = tr->func.addr; int ret; if (tr->func.ftrace_managed) ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false); else ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); return ret; } static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr, bool lock_direct_mutex) { void *ip = tr->func.addr; int ret; if (tr->func.ftrace_managed) { if (lock_direct_mutex) ret = modify_ftrace_direct(tr->fops, (long)new_addr); else ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr); } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); } return ret; } /* first time registering */ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) { void *ip = tr->func.addr; unsigned long faddr; int ret; faddr = ftrace_location((unsigned long)ip); if (faddr) { if (!tr->fops) return -ENOTSUPP; tr->func.ftrace_managed = true; } if (tr->func.ftrace_managed) { ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); ret = register_ftrace_direct(tr->fops, (long)new_addr); } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); } return ret; } static struct bpf_tramp_links * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { struct bpf_tramp_link *link; struct bpf_tramp_links *tlinks; struct bpf_tramp_link **links; int kind; *total = 0; tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); if (!tlinks) return ERR_PTR(-ENOMEM); for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { tlinks[kind].nr_links = tr->progs_cnt[kind]; *total += tr->progs_cnt[kind]; links = tlinks[kind].links; hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { *ip_arg |= link->link.prog->call_get_func_ip; *links++ = link; } } return tlinks; } static void bpf_tramp_image_free(struct bpf_tramp_image *im) { bpf_image_ksym_del(&im->ksym); arch_free_bpf_trampoline(im->image, im->size); bpf_jit_uncharge_modmem(im->size); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); } static void __bpf_tramp_image_put_deferred(struct work_struct *work) { struct bpf_tramp_image *im; im = container_of(work, struct bpf_tramp_image, work); bpf_tramp_image_free(im); } /* callback, fexit step 3 or fentry step 2 */ static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu) { struct bpf_tramp_image *im; im = container_of(rcu, struct bpf_tramp_image, rcu); INIT_WORK(&im->work, __bpf_tramp_image_put_deferred); schedule_work(&im->work); } /* callback, fexit step 2. Called after percpu_ref_kill confirms. */ static void __bpf_tramp_image_release(struct percpu_ref *pcref) { struct bpf_tramp_image *im; im = container_of(pcref, struct bpf_tramp_image, pcref); call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); } /* callback, fexit or fentry step 1 */ static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu) { struct bpf_tramp_image *im; im = container_of(rcu, struct bpf_tramp_image, rcu); if (im->ip_after_call) /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */ percpu_ref_kill(&im->pcref); else /* the case of fentry trampoline */ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); } static void bpf_tramp_image_put(struct bpf_tramp_image *im) { /* The trampoline image that calls original function is using: * rcu_read_lock_trace to protect sleepable bpf progs * rcu_read_lock to protect normal bpf progs * percpu_ref to protect trampoline itself * rcu tasks to protect trampoline asm not covered by percpu_ref * (which are few asm insns before __bpf_tramp_enter and * after __bpf_tramp_exit) * * The trampoline is unreachable before bpf_tramp_image_put(). * * First, patch the trampoline to avoid calling into fexit progs. * The progs will be freed even if the original function is still * executing or sleeping. * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on * first few asm instructions to execute and call into * __bpf_tramp_enter->percpu_ref_get. * Then use percpu_ref_kill to wait for the trampoline and the original * function to finish. * Then use call_rcu_tasks() to make sure few asm insns in * the trampoline epilogue are done as well. * * In !PREEMPT case the task that got interrupted in the first asm * insns won't go through an RCU quiescent state which the * percpu_ref_kill will be waiting for. Hence the first * call_rcu_tasks() is not necessary. */ if (im->ip_after_call) { int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, NULL, im->ip_epilogue); WARN_ON(err); if (IS_ENABLED(CONFIG_TASKS_RCU)) call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); else percpu_ref_kill(&im->pcref); return; } /* The trampoline without fexit and fmod_ret progs doesn't call original * function and doesn't use percpu_ref. * Use call_rcu_tasks_trace() to wait for sleepable progs to finish. * Then use call_rcu_tasks() to wait for the rest of trampoline asm * and normal progs. */ call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks); } static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size) { struct bpf_tramp_image *im; struct bpf_ksym *ksym; void *image; int err = -ENOMEM; im = kzalloc(sizeof(*im), GFP_KERNEL); if (!im) goto out; err = bpf_jit_charge_modmem(size); if (err) goto out_free_im; im->size = size; err = -ENOMEM; im->image = image = arch_alloc_bpf_trampoline(size); if (!image) goto out_uncharge; err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); if (err) goto out_free_image; ksym = &im->ksym; INIT_LIST_HEAD_RCU(&ksym->lnode); snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key); bpf_image_ksym_init(image, size, ksym); bpf_image_ksym_add(ksym); return im; out_free_image: arch_free_bpf_trampoline(im->image, im->size); out_uncharge: bpf_jit_uncharge_modmem(size); out_free_im: kfree(im); out: return ERR_PTR(err); } static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) { struct bpf_tramp_image *im; struct bpf_tramp_links *tlinks; u32 orig_flags = tr->flags; bool ip_arg = false; int err, total, size; tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); if (IS_ERR(tlinks)) return PTR_ERR(tlinks); if (total == 0) { err = unregister_fentry(tr, tr->cur_image->image); bpf_tramp_image_put(tr->cur_image); tr->cur_image = NULL; goto out; } /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); if (tlinks[BPF_TRAMP_FEXIT].nr_links || tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME * should not be set together. */ tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; } else { tr->flags |= BPF_TRAMP_F_RESTORE_REGS; } if (ip_arg) tr->flags |= BPF_TRAMP_F_IP_ARG; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS again: if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) && (tr->flags & BPF_TRAMP_F_CALL_ORIG)) tr->flags |= BPF_TRAMP_F_ORIG_STACK; #endif size = arch_bpf_trampoline_size(&tr->func.model, tr->flags, tlinks, tr->func.addr); if (size < 0) { err = size; goto out; } if (size > PAGE_SIZE) { err = -E2BIG; goto out; } im = bpf_tramp_image_alloc(tr->key, size); if (IS_ERR(im)) { err = PTR_ERR(im); goto out; } err = arch_prepare_bpf_trampoline(im, im->image, im->image + size, &tr->func.model, tr->flags, tlinks, tr->func.addr); if (err < 0) goto out_free; err = arch_protect_bpf_trampoline(im->image, im->size); if (err) goto out_free; WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); else /* first time registering */ err = register_fentry(tr, im->image); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if (err == -EAGAIN) { /* -EAGAIN from bpf_tramp_ftrace_ops_func. Now * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the * trampoline again, and retry register. */ /* reset fops->func and fops->trampoline for re-register */ tr->fops->func = NULL; tr->fops->trampoline = 0; /* free im memory and reallocate later */ bpf_tramp_image_free(im); goto again; } #endif if (err) goto out_free; if (tr->cur_image) bpf_tramp_image_put(tr->cur_image); tr->cur_image = im; out: /* If any error happens, restore previous flags */ if (err) tr->flags = orig_flags; kfree(tlinks); return err; out_free: bpf_tramp_image_free(im); goto out; } static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) { switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; case BPF_MODIFY_RETURN: return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) /* The function returns void, we cannot modify its * return value. */ return BPF_TRAMP_FEXIT; else return BPF_TRAMP_MODIFY_RETURN; default: return BPF_TRAMP_REPLACE; } } static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; guard(mutex)(&aux->ext_mutex); if (aux->prog_array_member_cnt) /* Program extensions can not extend target prog when the target * prog has been updated to any prog_array map as tail callee. * It's to prevent a potential infinite loop like: * tgt prog entry -> tgt prog subprog -> freplace prog entry * --tailcall-> tgt prog entry. */ return -EBUSY; aux->is_extended = true; return 0; } static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; int err = 0; int cnt = 0, i; kind = bpf_attach_type_to_tramp(link->link.prog); if (tr->extension_prog) /* cannot attach fentry/fexit if extension prog is attached. * cannot overwrite extension prog either. */ return -EBUSY; for (i = 0; i < BPF_TRAMP_MAX; i++) cnt += tr->progs_cnt[i]; if (kind == BPF_TRAMP_REPLACE) { /* Cannot attach extension if fentry/fexit are in use. */ if (cnt) return -EBUSY; err = bpf_freplace_check_tgt_prog(tgt_prog); if (err) return err; tr->extension_prog = link->link.prog; return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ return -EBUSY; hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { if (link_exiting->link.prog != link->link.prog) continue; /* prog already linked */ return -EBUSY; } hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; } return err; } int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { int err; mutex_lock(&tr->mutex); err = __bpf_trampoline_link_prog(link, tr, tgt_prog); mutex_unlock(&tr->mutex); return err; } static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { enum bpf_tramp_prog_type kind; int err; kind = bpf_attach_type_to_tramp(link->link.prog); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; return bpf_trampoline_update(tr, true /* lock_direct_mutex */); } /* bpf_trampoline_unlink_prog() should never fail. */ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { int err; mutex_lock(&tr->mutex); err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog); mutex_unlock(&tr->mutex); return err; } #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) static void bpf_shim_tramp_link_release(struct bpf_link *link) { struct bpf_shim_tramp_link *shim_link = container_of(link, struct bpf_shim_tramp_link, link.link); /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */ if (!shim_link->trampoline) return; WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL)); bpf_trampoline_put(shim_link->trampoline); } static void bpf_shim_tramp_link_dealloc(struct bpf_link *link) { struct bpf_shim_tramp_link *shim_link = container_of(link, struct bpf_shim_tramp_link, link.link); kfree(shim_link); } static const struct bpf_link_ops bpf_shim_tramp_link_lops = { .release = bpf_shim_tramp_link_release, .dealloc = bpf_shim_tramp_link_dealloc, }; static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, bpf_func_t bpf_func, int cgroup_atype) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_prog *p; shim_link = kzalloc(sizeof(*shim_link), GFP_USER); if (!shim_link) return NULL; p = bpf_prog_alloc(1, 0); if (!p) { kfree(shim_link); return NULL; } p->jited = false; p->bpf_func = bpf_func; p->aux->cgroup_atype = cgroup_atype; p->aux->attach_func_proto = prog->aux->attach_func_proto; p->aux->attach_btf_id = prog->aux->attach_btf_id; p->aux->attach_btf = prog->aux->attach_btf; btf_get(p->aux->attach_btf); p->type = BPF_PROG_TYPE_LSM; p->expected_attach_type = BPF_LSM_MAC; bpf_prog_inc(p); bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, &bpf_shim_tramp_link_lops, p); bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; } static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, bpf_func_t bpf_func) { struct bpf_tramp_link *link; int kind; for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { struct bpf_prog *p = link->link.prog; if (p->bpf_func == bpf_func) return container_of(link, struct bpf_shim_tramp_link, link); } } return NULL; } int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_attach_target_info tgt_info = {}; struct bpf_trampoline *tr; bpf_func_t bpf_func; u64 key; int err; err = bpf_check_attach_target(NULL, prog, NULL, prog->aux->attach_btf_id, &tgt_info); if (err) return err; key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, prog->aux->attach_btf_id); bpf_lsm_find_cgroup_shim(prog, &bpf_func); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) return -ENOMEM; mutex_lock(&tr->mutex); shim_link = cgroup_shim_find(tr, bpf_func); if (shim_link) { /* Reusing existing shim attached by the other program. */ bpf_link_inc(&shim_link->link.link); mutex_unlock(&tr->mutex); bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return 0; } /* Allocate and install new shim. */ shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); if (!shim_link) { err = -ENOMEM; goto err; } err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL); if (err) goto err; shim_link->trampoline = tr; /* note, we're still holding tr refcnt from above */ mutex_unlock(&tr->mutex); return 0; err: mutex_unlock(&tr->mutex); if (shim_link) bpf_link_put(&shim_link->link.link); /* have to release tr while _not_ holding its mutex */ bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return err; } void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_trampoline *tr; bpf_func_t bpf_func; u64 key; key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, prog->aux->attach_btf_id); bpf_lsm_find_cgroup_shim(prog, &bpf_func); tr = bpf_trampoline_lookup(key); if (WARN_ON_ONCE(!tr)) return; mutex_lock(&tr->mutex); shim_link = cgroup_shim_find(tr, bpf_func); mutex_unlock(&tr->mutex); if (shim_link) bpf_link_put(&shim_link->link.link); bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */ } #endif struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { struct bpf_trampoline *tr; tr = bpf_trampoline_lookup(key); if (!tr) return NULL; mutex_lock(&tr->mutex); if (tr->func.addr) goto out; memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel)); tr->func.addr = (void *)tgt_info->tgt_addr; out: mutex_unlock(&tr->mutex); return tr; } void bpf_trampoline_put(struct bpf_trampoline *tr) { int i; if (!tr) return; mutex_lock(&trampoline_mutex); if (!refcount_dec_and_test(&tr->refcnt)) goto out; WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); for (i = 0; i < BPF_TRAMP_MAX; i++) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i]))) goto out; /* This code will be executed even when the last bpf_tramp_image * is alive. All progs are detached from the trampoline and the * trampoline image is patched with jmp into epilogue to skip * fexit progs. The fentry-only trampoline will be freed via * multiple rcu callbacks. */ hlist_del(&tr->hlist); if (tr->fops) { ftrace_free_filter(tr->fops); kfree(tr->fops); } kfree(tr); out: mutex_unlock(&trampoline_mutex); } #define NO_START_TIME 1 static __always_inline u64 notrace bpf_prog_start_time(void) { u64 start = NO_START_TIME; if (static_branch_unlikely(&bpf_stats_enabled_key)) { start = sched_clock(); if (unlikely(!start)) start = NO_START_TIME; } return start; } /* The logic is similar to bpf_prog_run(), but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into * call __bpf_prog_enter * call prog->bpf_func * call __bpf_prog_exit * * __bpf_prog_enter returns: * 0 - skip execution of the bpf prog * 1 - execute bpf prog * [2..MAX_U64] - execute bpf prog and record execution time. * This is start time. */ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock(); migrate_disable(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); if (prog->aux->recursion_detected) prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); } static void notrace update_prog_stats(struct bpf_prog *prog, u64 start) { struct bpf_prog_stats *stats; if (static_branch_unlikely(&bpf_stats_enabled_key) && /* static_key could be enabled in __bpf_prog_enter* * and disabled in __bpf_prog_exit*. * And vice versa. * Hence check that 'start' is valid. */ start > NO_START_TIME) { u64 duration = sched_clock() - start; unsigned long flags; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); u64_stats_add(&stats->nsecs, duration); u64_stats_update_end_irqrestore(&stats->syncp, flags); } } static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock(); } static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { /* Runtime stats are exported via actual BPF_LSM_CGROUP * programs, not the shims. */ rcu_read_lock(); migrate_disable(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); return NO_START_TIME; } static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); migrate_enable(); rcu_read_unlock(); } u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); migrate_disable(); might_fault(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); if (prog->aux->recursion_detected) prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); } void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock_trace(); } static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); migrate_disable(); might_fault(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); return bpf_prog_start_time(); } static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); migrate_enable(); rcu_read_unlock_trace(); } static u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock(); migrate_disable(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); return bpf_prog_start_time(); } static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); migrate_enable(); rcu_read_unlock(); } void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) { percpu_ref_get(&tr->pcref); } void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) { percpu_ref_put(&tr->pcref); } bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog) { bool sleepable = prog->sleepable; if (bpf_prog_check_recur(prog)) return sleepable ? __bpf_prog_enter_sleepable_recur : __bpf_prog_enter_recur; if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && prog->expected_attach_type == BPF_LSM_CGROUP) return __bpf_prog_enter_lsm_cgroup; return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter; } bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) { bool sleepable = prog->sleepable; if (bpf_prog_check_recur(prog)) return sleepable ? __bpf_prog_exit_sleepable_recur : __bpf_prog_exit_recur; if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && prog->expected_attach_type == BPF_LSM_CGROUP) return __bpf_prog_exit_lsm_cgroup; return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit; } int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) { return -ENOTSUPP; } void * __weak arch_alloc_bpf_trampoline(unsigned int size) { void *image; if (WARN_ON_ONCE(size > PAGE_SIZE)) return NULL; image = bpf_jit_alloc_exec(PAGE_SIZE); if (image) set_vm_flush_reset_perms(image); return image; } void __weak arch_free_bpf_trampoline(void *image, unsigned int size) { WARN_ON_ONCE(size > PAGE_SIZE); /* bpf_jit_free_exec doesn't need "size", but * bpf_prog_pack_free() needs it. */ bpf_jit_free_exec(image); } int __weak arch_protect_bpf_trampoline(void *image, unsigned int size) { WARN_ON_ONCE(size > PAGE_SIZE); return set_memory_rox((long)image, 1); } int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) { return -ENOTSUPP; } static int __init init_trampolines(void) { int i; for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) INIT_HLIST_HEAD(&trampoline_table[i]); return 0; } late_initcall(init_trampolines);
2 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix * Copyright (C) 2006 Andrey Volkov, Varma Electronics * Copyright (C) 2008-2009 Wolfgang Grandegger <wg@grandegger.com> * Copyright (C) 2021 Vincent Mailhol <mailhol.vincent@wanadoo.fr> */ #include <linux/can/dev.h> #include <net/rtnetlink.h> static const struct nla_policy can_policy[IFLA_CAN_MAX + 1] = { [IFLA_CAN_STATE] = { .type = NLA_U32 }, [IFLA_CAN_CTRLMODE] = { .len = sizeof(struct can_ctrlmode) }, [IFLA_CAN_RESTART_MS] = { .type = NLA_U32 }, [IFLA_CAN_RESTART] = { .type = NLA_U32 }, [IFLA_CAN_BITTIMING] = { .len = sizeof(struct can_bittiming) }, [IFLA_CAN_BITTIMING_CONST] = { .len = sizeof(struct can_bittiming_const) }, [IFLA_CAN_CLOCK] = { .len = sizeof(struct can_clock) }, [IFLA_CAN_BERR_COUNTER] = { .len = sizeof(struct can_berr_counter) }, [IFLA_CAN_DATA_BITTIMING] = { .len = sizeof(struct can_bittiming) }, [IFLA_CAN_DATA_BITTIMING_CONST] = { .len = sizeof(struct can_bittiming_const) }, [IFLA_CAN_TERMINATION] = { .type = NLA_U16 }, [IFLA_CAN_TDC] = { .type = NLA_NESTED }, [IFLA_CAN_CTRLMODE_EXT] = { .type = NLA_NESTED }, }; static const struct nla_policy can_tdc_policy[IFLA_CAN_TDC_MAX + 1] = { [IFLA_CAN_TDC_TDCV_MIN] = { .type = NLA_U32 }, [IFLA_CAN_TDC_TDCV_MAX] = { .type = NLA_U32 }, [IFLA_CAN_TDC_TDCO_MIN] = { .type = NLA_U32 }, [IFLA_CAN_TDC_TDCO_MAX] = { .type = NLA_U32 }, [IFLA_CAN_TDC_TDCF_MIN] = { .type = NLA_U32 }, [IFLA_CAN_TDC_TDCF_MAX] = { .type = NLA_U32 }, [IFLA_CAN_TDC_TDCV] = { .type = NLA_U32 }, [IFLA_CAN_TDC_TDCO] = { .type = NLA_U32 }, [IFLA_CAN_TDC_TDCF] = { .type = NLA_U32 }, }; static int can_validate_bittiming(const struct can_bittiming *bt, struct netlink_ext_ack *extack) { /* sample point is in one-tenth of a percent */ if (bt->sample_point >= 1000) { NL_SET_ERR_MSG(extack, "sample point must be between 0 and 100%"); return -EINVAL; } return 0; } static int can_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { bool is_can_fd = false; int err; /* Make sure that valid CAN FD configurations always consist of * - nominal/arbitration bittiming * - data bittiming * - control mode with CAN_CTRLMODE_FD set * - TDC parameters are coherent (details below) */ if (!data) return 0; if (data[IFLA_CAN_CTRLMODE]) { struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]); u32 tdc_flags = cm->flags & CAN_CTRLMODE_TDC_MASK; is_can_fd = cm->flags & cm->mask & CAN_CTRLMODE_FD; /* CAN_CTRLMODE_TDC_{AUTO,MANUAL} are mutually exclusive */ if (tdc_flags == CAN_CTRLMODE_TDC_MASK) return -EOPNOTSUPP; /* If one of the CAN_CTRLMODE_TDC_* flag is set then * TDC must be set and vice-versa */ if (!!tdc_flags != !!data[IFLA_CAN_TDC]) return -EOPNOTSUPP; /* If providing TDC parameters, at least TDCO is * needed. TDCV is needed if and only if * CAN_CTRLMODE_TDC_MANUAL is set */ if (data[IFLA_CAN_TDC]) { struct nlattr *tb_tdc[IFLA_CAN_TDC_MAX + 1]; err = nla_parse_nested(tb_tdc, IFLA_CAN_TDC_MAX, data[IFLA_CAN_TDC], can_tdc_policy, extack); if (err) return err; if (tb_tdc[IFLA_CAN_TDC_TDCV]) { if (tdc_flags & CAN_CTRLMODE_TDC_AUTO) return -EOPNOTSUPP; } else { if (tdc_flags & CAN_CTRLMODE_TDC_MANUAL) return -EOPNOTSUPP; } if (!tb_tdc[IFLA_CAN_TDC_TDCO]) return -EOPNOTSUPP; } } if (data[IFLA_CAN_BITTIMING]) { struct can_bittiming bt; memcpy(&bt, nla_data(data[IFLA_CAN_BITTIMING]), sizeof(bt)); err = can_validate_bittiming(&bt, extack); if (err) return err; } if (is_can_fd) { if (!data[IFLA_CAN_BITTIMING] || !data[IFLA_CAN_DATA_BITTIMING]) return -EOPNOTSUPP; } if (data[IFLA_CAN_DATA_BITTIMING] || data[IFLA_CAN_TDC]) { if (!is_can_fd) return -EOPNOTSUPP; } if (data[IFLA_CAN_DATA_BITTIMING]) { struct can_bittiming bt; memcpy(&bt, nla_data(data[IFLA_CAN_DATA_BITTIMING]), sizeof(bt)); err = can_validate_bittiming(&bt, extack); if (err) return err; } return 0; } static int can_tdc_changelink(struct can_priv *priv, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct nlattr *tb_tdc[IFLA_CAN_TDC_MAX + 1]; struct can_tdc tdc = { 0 }; const struct can_tdc_const *tdc_const = priv->tdc_const; int err; if (!tdc_const || !can_tdc_is_enabled(priv)) return -EOPNOTSUPP; err = nla_parse_nested(tb_tdc, IFLA_CAN_TDC_MAX, nla, can_tdc_policy, extack); if (err) return err; if (tb_tdc[IFLA_CAN_TDC_TDCV]) { u32 tdcv = nla_get_u32(tb_tdc[IFLA_CAN_TDC_TDCV]); if (tdcv < tdc_const->tdcv_min || tdcv > tdc_const->tdcv_max) return -EINVAL; tdc.tdcv = tdcv; } if (tb_tdc[IFLA_CAN_TDC_TDCO]) { u32 tdco = nla_get_u32(tb_tdc[IFLA_CAN_TDC_TDCO]); if (tdco < tdc_const->tdco_min || tdco > tdc_const->tdco_max) return -EINVAL; tdc.tdco = tdco; } if (tb_tdc[IFLA_CAN_TDC_TDCF]) { u32 tdcf = nla_get_u32(tb_tdc[IFLA_CAN_TDC_TDCF]); if (tdcf < tdc_const->tdcf_min || tdcf > tdc_const->tdcf_max) return -EINVAL; tdc.tdcf = tdcf; } priv->tdc = tdc; return 0; } static int can_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct can_priv *priv = netdev_priv(dev); u32 tdc_mask = 0; int err; /* We need synchronization with dev->stop() */ ASSERT_RTNL(); if (data[IFLA_CAN_CTRLMODE]) { struct can_ctrlmode *cm; u32 ctrlstatic; u32 maskedflags; /* Do not allow changing controller mode while running */ if (dev->flags & IFF_UP) return -EBUSY; cm = nla_data(data[IFLA_CAN_CTRLMODE]); ctrlstatic = can_get_static_ctrlmode(priv); maskedflags = cm->flags & cm->mask; /* check whether provided bits are allowed to be passed */ if (maskedflags & ~(priv->ctrlmode_supported | ctrlstatic)) return -EOPNOTSUPP; /* do not check for static fd-non-iso if 'fd' is disabled */ if (!(maskedflags & CAN_CTRLMODE_FD)) ctrlstatic &= ~CAN_CTRLMODE_FD_NON_ISO; /* make sure static options are provided by configuration */ if ((maskedflags & ctrlstatic) != ctrlstatic) return -EOPNOTSUPP; /* clear bits to be modified and copy the flag values */ priv->ctrlmode &= ~cm->mask; priv->ctrlmode |= maskedflags; /* CAN_CTRLMODE_FD can only be set when driver supports FD */ if (priv->ctrlmode & CAN_CTRLMODE_FD) { dev->mtu = CANFD_MTU; } else { dev->mtu = CAN_MTU; memset(&priv->data_bittiming, 0, sizeof(priv->data_bittiming)); priv->ctrlmode &= ~CAN_CTRLMODE_TDC_MASK; memset(&priv->tdc, 0, sizeof(priv->tdc)); } tdc_mask = cm->mask & CAN_CTRLMODE_TDC_MASK; /* CAN_CTRLMODE_TDC_{AUTO,MANUAL} are mutually * exclusive: make sure to turn the other one off */ if (tdc_mask) priv->ctrlmode &= cm->flags | ~CAN_CTRLMODE_TDC_MASK; } if (data[IFLA_CAN_BITTIMING]) { struct can_bittiming bt; /* Do not allow changing bittiming while running */ if (dev->flags & IFF_UP) return -EBUSY; /* Calculate bittiming parameters based on * bittiming_const if set, otherwise pass bitrate * directly via do_set_bitrate(). Bail out if neither * is given. */ if (!priv->bittiming_const && !priv->do_set_bittiming && !priv->bitrate_const) return -EOPNOTSUPP; memcpy(&bt, nla_data(data[IFLA_CAN_BITTIMING]), sizeof(bt)); err = can_get_bittiming(dev, &bt, priv->bittiming_const, priv->bitrate_const, priv->bitrate_const_cnt, extack); if (err) return err; if (priv->bitrate_max && bt.bitrate > priv->bitrate_max) { NL_SET_ERR_MSG_FMT(extack, "arbitration bitrate %u bps surpasses transceiver capabilities of %u bps", bt.bitrate, priv->bitrate_max); return -EINVAL; } memcpy(&priv->bittiming, &bt, sizeof(bt)); if (priv->do_set_bittiming) { /* Finally, set the bit-timing registers */ err = priv->do_set_bittiming(dev); if (err) return err; } } if (data[IFLA_CAN_RESTART_MS]) { /* Do not allow changing restart delay while running */ if (dev->flags & IFF_UP) return -EBUSY; priv->restart_ms = nla_get_u32(data[IFLA_CAN_RESTART_MS]); } if (data[IFLA_CAN_RESTART]) { /* Do not allow a restart while not running */ if (!(dev->flags & IFF_UP)) return -EINVAL; err = can_restart_now(dev); if (err) return err; } if (data[IFLA_CAN_DATA_BITTIMING]) { struct can_bittiming dbt; /* Do not allow changing bittiming while running */ if (dev->flags & IFF_UP) return -EBUSY; /* Calculate bittiming parameters based on * data_bittiming_const if set, otherwise pass bitrate * directly via do_set_bitrate(). Bail out if neither * is given. */ if (!priv->data_bittiming_const && !priv->do_set_data_bittiming && !priv->data_bitrate_const) return -EOPNOTSUPP; memcpy(&dbt, nla_data(data[IFLA_CAN_DATA_BITTIMING]), sizeof(dbt)); err = can_get_bittiming(dev, &dbt, priv->data_bittiming_const, priv->data_bitrate_const, priv->data_bitrate_const_cnt, extack); if (err) return err; if (priv->bitrate_max && dbt.bitrate > priv->bitrate_max) { NL_SET_ERR_MSG_FMT(extack, "CANFD data bitrate %u bps surpasses transceiver capabilities of %u bps", dbt.bitrate, priv->bitrate_max); return -EINVAL; } memset(&priv->tdc, 0, sizeof(priv->tdc)); if (data[IFLA_CAN_TDC]) { /* TDC parameters are provided: use them */ err = can_tdc_changelink(priv, data[IFLA_CAN_TDC], extack); if (err) { priv->ctrlmode &= ~CAN_CTRLMODE_TDC_MASK; return err; } } else if (!tdc_mask) { /* Neither of TDC parameters nor TDC flags are * provided: do calculation */ can_calc_tdco(&priv->tdc, priv->tdc_const, &dbt, &priv->ctrlmode, priv->ctrlmode_supported); } /* else: both CAN_CTRLMODE_TDC_{AUTO,MANUAL} are explicitly * turned off. TDC is disabled: do nothing */ memcpy(&priv->data_bittiming, &dbt, sizeof(dbt)); if (priv->do_set_data_bittiming) { /* Finally, set the bit-timing registers */ err = priv->do_set_data_bittiming(dev); if (err) return err; } } if (data[IFLA_CAN_TERMINATION]) { const u16 termval = nla_get_u16(data[IFLA_CAN_TERMINATION]); const unsigned int num_term = priv->termination_const_cnt; unsigned int i; if (!priv->do_set_termination) return -EOPNOTSUPP; /* check whether given value is supported by the interface */ for (i = 0; i < num_term; i++) { if (termval == priv->termination_const[i]) break; } if (i >= num_term) return -EINVAL; /* Finally, set the termination value */ err = priv->do_set_termination(dev, termval); if (err) return err; priv->termination = termval; } return 0; } static size_t can_tdc_get_size(const struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); size_t size; if (!priv->tdc_const) return 0; size = nla_total_size(0); /* nest IFLA_CAN_TDC */ if (priv->ctrlmode_supported & CAN_CTRLMODE_TDC_MANUAL) { size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCV_MIN */ size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCV_MAX */ } size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCO_MIN */ size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCO_MAX */ if (priv->tdc_const->tdcf_max) { size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCF_MIN */ size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCF_MAX */ } if (can_tdc_is_enabled(priv)) { if (priv->ctrlmode & CAN_CTRLMODE_TDC_MANUAL || priv->do_get_auto_tdcv) size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCV */ size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCO */ if (priv->tdc_const->tdcf_max) size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCF */ } return size; } static size_t can_ctrlmode_ext_get_size(void) { return nla_total_size(0) + /* nest IFLA_CAN_CTRLMODE_EXT */ nla_total_size(sizeof(u32)); /* IFLA_CAN_CTRLMODE_SUPPORTED */ } static size_t can_get_size(const struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); size_t size = 0; if (priv->bittiming.bitrate) /* IFLA_CAN_BITTIMING */ size += nla_total_size(sizeof(struct can_bittiming)); if (priv->bittiming_const) /* IFLA_CAN_BITTIMING_CONST */ size += nla_total_size(sizeof(struct can_bittiming_const)); size += nla_total_size(sizeof(struct can_clock)); /* IFLA_CAN_CLOCK */ size += nla_total_size(sizeof(u32)); /* IFLA_CAN_STATE */ size += nla_total_size(sizeof(struct can_ctrlmode)); /* IFLA_CAN_CTRLMODE */ size += nla_total_size(sizeof(u32)); /* IFLA_CAN_RESTART_MS */ if (priv->do_get_berr_counter) /* IFLA_CAN_BERR_COUNTER */ size += nla_total_size(sizeof(struct can_berr_counter)); if (priv->data_bittiming.bitrate) /* IFLA_CAN_DATA_BITTIMING */ size += nla_total_size(sizeof(struct can_bittiming)); if (priv->data_bittiming_const) /* IFLA_CAN_DATA_BITTIMING_CONST */ size += nla_total_size(sizeof(struct can_bittiming_const)); if (priv->termination_const) { size += nla_total_size(sizeof(priv->termination)); /* IFLA_CAN_TERMINATION */ size += nla_total_size(sizeof(*priv->termination_const) * /* IFLA_CAN_TERMINATION_CONST */ priv->termination_const_cnt); } if (priv->bitrate_const) /* IFLA_CAN_BITRATE_CONST */ size += nla_total_size(sizeof(*priv->bitrate_const) * priv->bitrate_const_cnt); if (priv->data_bitrate_const) /* IFLA_CAN_DATA_BITRATE_CONST */ size += nla_total_size(sizeof(*priv->data_bitrate_const) * priv->data_bitrate_const_cnt); size += sizeof(priv->bitrate_max); /* IFLA_CAN_BITRATE_MAX */ size += can_tdc_get_size(dev); /* IFLA_CAN_TDC */ size += can_ctrlmode_ext_get_size(); /* IFLA_CAN_CTRLMODE_EXT */ return size; } static int can_tdc_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *nest; struct can_priv *priv = netdev_priv(dev); struct can_tdc *tdc = &priv->tdc; const struct can_tdc_const *tdc_const = priv->tdc_const; if (!tdc_const) return 0; nest = nla_nest_start(skb, IFLA_CAN_TDC); if (!nest) return -EMSGSIZE; if (priv->ctrlmode_supported & CAN_CTRLMODE_TDC_MANUAL && (nla_put_u32(skb, IFLA_CAN_TDC_TDCV_MIN, tdc_const->tdcv_min) || nla_put_u32(skb, IFLA_CAN_TDC_TDCV_MAX, tdc_const->tdcv_max))) goto err_cancel; if (nla_put_u32(skb, IFLA_CAN_TDC_TDCO_MIN, tdc_const->tdco_min) || nla_put_u32(skb, IFLA_CAN_TDC_TDCO_MAX, tdc_const->tdco_max)) goto err_cancel; if (tdc_const->tdcf_max && (nla_put_u32(skb, IFLA_CAN_TDC_TDCF_MIN, tdc_const->tdcf_min) || nla_put_u32(skb, IFLA_CAN_TDC_TDCF_MAX, tdc_const->tdcf_max))) goto err_cancel; if (can_tdc_is_enabled(priv)) { u32 tdcv; int err = -EINVAL; if (priv->ctrlmode & CAN_CTRLMODE_TDC_MANUAL) { tdcv = tdc->tdcv; err = 0; } else if (priv->do_get_auto_tdcv) { err = priv->do_get_auto_tdcv(dev, &tdcv); } if (!err && nla_put_u32(skb, IFLA_CAN_TDC_TDCV, tdcv)) goto err_cancel; if (nla_put_u32(skb, IFLA_CAN_TDC_TDCO, tdc->tdco)) goto err_cancel; if (tdc_const->tdcf_max && nla_put_u32(skb, IFLA_CAN_TDC_TDCF, tdc->tdcf)) goto err_cancel; } nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int can_ctrlmode_ext_fill_info(struct sk_buff *skb, const struct can_priv *priv) { struct nlattr *nest; nest = nla_nest_start(skb, IFLA_CAN_CTRLMODE_EXT); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_CAN_CTRLMODE_SUPPORTED, priv->ctrlmode_supported)) { nla_nest_cancel(skb, nest); return -EMSGSIZE; } nla_nest_end(skb, nest); return 0; } static int can_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); struct can_ctrlmode cm = {.flags = priv->ctrlmode}; struct can_berr_counter bec = { }; enum can_state state = priv->state; if (priv->do_get_state) priv->do_get_state(dev, &state); if ((priv->bittiming.bitrate != CAN_BITRATE_UNSET && priv->bittiming.bitrate != CAN_BITRATE_UNKNOWN && nla_put(skb, IFLA_CAN_BITTIMING, sizeof(priv->bittiming), &priv->bittiming)) || (priv->bittiming_const && nla_put(skb, IFLA_CAN_BITTIMING_CONST, sizeof(*priv->bittiming_const), priv->bittiming_const)) || nla_put(skb, IFLA_CAN_CLOCK, sizeof(priv->clock), &priv->clock) || nla_put_u32(skb, IFLA_CAN_STATE, state) || nla_put(skb, IFLA_CAN_CTRLMODE, sizeof(cm), &cm) || nla_put_u32(skb, IFLA_CAN_RESTART_MS, priv->restart_ms) || (priv->do_get_berr_counter && !priv->do_get_berr_counter(dev, &bec) && nla_put(skb, IFLA_CAN_BERR_COUNTER, sizeof(bec), &bec)) || (priv->data_bittiming.bitrate && nla_put(skb, IFLA_CAN_DATA_BITTIMING, sizeof(priv->data_bittiming), &priv->data_bittiming)) || (priv->data_bittiming_const && nla_put(skb, IFLA_CAN_DATA_BITTIMING_CONST, sizeof(*priv->data_bittiming_const), priv->data_bittiming_const)) || (priv->termination_const && (nla_put_u16(skb, IFLA_CAN_TERMINATION, priv->termination) || nla_put(skb, IFLA_CAN_TERMINATION_CONST, sizeof(*priv->termination_const) * priv->termination_const_cnt, priv->termination_const))) || (priv->bitrate_const && nla_put(skb, IFLA_CAN_BITRATE_CONST, sizeof(*priv->bitrate_const) * priv->bitrate_const_cnt, priv->bitrate_const)) || (priv->data_bitrate_const && nla_put(skb, IFLA_CAN_DATA_BITRATE_CONST, sizeof(*priv->data_bitrate_const) * priv->data_bitrate_const_cnt, priv->data_bitrate_const)) || (nla_put(skb, IFLA_CAN_BITRATE_MAX, sizeof(priv->bitrate_max), &priv->bitrate_max)) || can_tdc_fill_info(skb, dev) || can_ctrlmode_ext_fill_info(skb, priv) ) return -EMSGSIZE; return 0; } static size_t can_get_xstats_size(const struct net_device *dev) { return sizeof(struct can_device_stats); } static int can_fill_xstats(struct sk_buff *skb, const struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); if (nla_put(skb, IFLA_INFO_XSTATS, sizeof(priv->can_stats), &priv->can_stats)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int can_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static void can_dellink(struct net_device *dev, struct list_head *head) { } struct rtnl_link_ops can_link_ops __read_mostly = { .kind = "can", .netns_refund = true, .maxtype = IFLA_CAN_MAX, .policy = can_policy, .setup = can_setup, .validate = can_validate, .newlink = can_newlink, .changelink = can_changelink, .dellink = can_dellink, .get_size = can_get_size, .fill_info = can_fill_info, .get_xstats_size = can_get_xstats_size, .fill_xstats = can_fill_xstats, }; int can_netlink_register(void) { return rtnl_link_register(&can_link_ops); } void can_netlink_unregister(void) { rtnl_link_unregister(&can_link_ops); }
27 27 27 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 // SPDX-License-Identifier: GPL-2.0-or-later /* * xt_connmark - Netfilter module to operate on connection marks * * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com> * by Henrik Nordstrom <hno@marasystems.com> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * Jan Engelhardt <jengelh@medozas.de> */ #include <linux/module.h> #include <linux/skbuff.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_connmark.h> MODULE_AUTHOR("Henrik Nordstrom <hno@marasystems.com>"); MODULE_DESCRIPTION("Xtables: connection mark operations"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_CONNMARK"); MODULE_ALIAS("ip6t_CONNMARK"); MODULE_ALIAS("ipt_connmark"); MODULE_ALIAS("ip6t_connmark"); static unsigned int connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) { enum ip_conntrack_info ctinfo; u_int32_t new_targetmark; struct nf_conn *ct; u_int32_t newmark; u_int32_t oldmark; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return XT_CONTINUE; switch (info->mode) { case XT_CONNMARK_SET: oldmark = READ_ONCE(ct->mark); newmark = (oldmark & ~info->ctmask) ^ info->ctmark; if (info->shift_dir == D_SHIFT_RIGHT) newmark >>= info->shift_bits; else newmark <<= info->shift_bits; if (READ_ONCE(ct->mark) != newmark) { WRITE_ONCE(ct->mark, newmark); nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_SAVE: new_targetmark = (skb->mark & info->nfmask); if (info->shift_dir == D_SHIFT_RIGHT) new_targetmark >>= info->shift_bits; else new_targetmark <<= info->shift_bits; newmark = (READ_ONCE(ct->mark) & ~info->ctmask) ^ new_targetmark; if (READ_ONCE(ct->mark) != newmark) { WRITE_ONCE(ct->mark, newmark); nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_RESTORE: new_targetmark = (READ_ONCE(ct->mark) & info->ctmask); if (info->shift_dir == D_SHIFT_RIGHT) new_targetmark >>= info->shift_bits; else new_targetmark <<= info->shift_bits; newmark = (skb->mark & ~info->nfmask) ^ new_targetmark; skb->mark = newmark; break; } return XT_CONTINUE; } static unsigned int connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_connmark_tginfo1 *info = par->targinfo; const struct xt_connmark_tginfo2 info2 = { .ctmark = info->ctmark, .ctmask = info->ctmask, .nfmask = info->nfmask, .mode = info->mode, }; return connmark_tg_shift(skb, &info2); } static unsigned int connmark_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_connmark_tginfo2 *info = par->targinfo; return connmark_tg_shift(skb, info); } static int connmark_tg_check(const struct xt_tgchk_param *par) { int ret; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } static void connmark_tg_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static bool connmark_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_connmark_mtinfo1 *info = par->matchinfo; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return false; return ((READ_ONCE(ct->mark) & info->mask) == info->mark) ^ info->invert; } static int connmark_mt_check(const struct xt_mtchk_param *par) { int ret; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } static void connmark_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 1, .family = NFPROTO_IPV4, .checkentry = connmark_tg_check, .target = connmark_tg, .targetsize = sizeof(struct xt_connmark_tginfo1), .destroy = connmark_tg_destroy, .me = THIS_MODULE, }, { .name = "CONNMARK", .revision = 2, .family = NFPROTO_IPV4, .checkentry = connmark_tg_check, .target = connmark_tg_v2, .targetsize = sizeof(struct xt_connmark_tginfo2), .destroy = connmark_tg_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "CONNMARK", .revision = 1, .family = NFPROTO_IPV6, .checkentry = connmark_tg_check, .target = connmark_tg, .targetsize = sizeof(struct xt_connmark_tginfo1), .destroy = connmark_tg_destroy, .me = THIS_MODULE, }, { .name = "CONNMARK", .revision = 2, .family = NFPROTO_IPV6, .checkentry = connmark_tg_check, .target = connmark_tg_v2, .targetsize = sizeof(struct xt_connmark_tginfo2), .destroy = connmark_tg_destroy, .me = THIS_MODULE, }, #endif }; static struct xt_match connmark_mt_reg __read_mostly = { .name = "connmark", .revision = 1, .family = NFPROTO_UNSPEC, .checkentry = connmark_mt_check, .match = connmark_mt, .matchsize = sizeof(struct xt_connmark_mtinfo1), .destroy = connmark_mt_destroy, .me = THIS_MODULE, }; static int __init connmark_mt_init(void) { int ret; ret = xt_register_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg)); if (ret < 0) return ret; ret = xt_register_match(&connmark_mt_reg); if (ret < 0) { xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg)); return ret; } return 0; } static void __exit connmark_mt_exit(void) { xt_unregister_match(&connmark_mt_reg); xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg)); } module_init(connmark_mt_init); module_exit(connmark_mt_exit);
40 9 2 39 7 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 /* SPDX-License-Identifier: GPL-2.0-only */ #undef TRACE_SYSTEM #define TRACE_SYSTEM l2tp #if !defined(_TRACE_L2TP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_L2TP_H #include <linux/tracepoint.h> #include <linux/l2tp.h> #include "l2tp_core.h" #define encap_type_name(e) { L2TP_ENCAPTYPE_##e, #e } #define show_encap_type_name(val) \ __print_symbolic(val, \ encap_type_name(UDP), \ encap_type_name(IP)) #define pw_type_name(p) { L2TP_PWTYPE_##p, #p } #define show_pw_type_name(val) \ __print_symbolic(val, \ pw_type_name(ETH_VLAN), \ pw_type_name(ETH), \ pw_type_name(PPP), \ pw_type_name(PPP_AC), \ pw_type_name(IP)) DECLARE_EVENT_CLASS(tunnel_only_evt, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel), TP_STRUCT__entry( __array(char, name, L2TP_TUNNEL_NAME_MAX) ), TP_fast_assign( memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX); ), TP_printk("%s", __entry->name) ); DECLARE_EVENT_CLASS(session_only_evt, TP_PROTO(struct l2tp_session *session), TP_ARGS(session), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); ), TP_printk("%s", __entry->name) ); TRACE_EVENT(register_tunnel, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel), TP_STRUCT__entry( __array(char, name, L2TP_TUNNEL_NAME_MAX) __field(int, fd) __field(u32, tid) __field(u32, ptid) __field(int, version) __field(enum l2tp_encap_type, encap) ), TP_fast_assign( memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX); __entry->fd = tunnel->fd; __entry->tid = tunnel->tunnel_id; __entry->ptid = tunnel->peer_tunnel_id; __entry->version = tunnel->version; __entry->encap = tunnel->encap; ), TP_printk("%s: type=%s encap=%s version=L2TPv%d tid=%u ptid=%u fd=%d", __entry->name, __entry->fd > 0 ? "managed" : "unmanaged", show_encap_type_name(__entry->encap), __entry->version, __entry->tid, __entry->ptid, __entry->fd) ); DEFINE_EVENT(tunnel_only_evt, delete_tunnel, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel) ); DEFINE_EVENT(tunnel_only_evt, free_tunnel, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel) ); TRACE_EVENT(register_session, TP_PROTO(struct l2tp_session *session), TP_ARGS(session), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) __field(u32, tid) __field(u32, ptid) __field(u32, sid) __field(u32, psid) __field(enum l2tp_pwtype, pwtype) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); __entry->tid = session->tunnel ? session->tunnel->tunnel_id : 0; __entry->ptid = session->tunnel ? session->tunnel->peer_tunnel_id : 0; __entry->sid = session->session_id; __entry->psid = session->peer_session_id; __entry->pwtype = session->pwtype; ), TP_printk("%s: pseudowire=%s sid=%u psid=%u tid=%u ptid=%u", __entry->name, show_pw_type_name(__entry->pwtype), __entry->sid, __entry->psid, __entry->sid, __entry->psid) ); DEFINE_EVENT(session_only_evt, delete_session, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_only_evt, free_session, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_only_evt, session_seqnum_lns_enable, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_only_evt, session_seqnum_lns_disable, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DECLARE_EVENT_CLASS(session_seqnum_evt, TP_PROTO(struct l2tp_session *session), TP_ARGS(session), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) __field(u32, ns) __field(u32, nr) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); __entry->ns = session->ns; __entry->nr = session->nr; ), TP_printk("%s: ns=%u nr=%u", __entry->name, __entry->ns, __entry->nr) ); DEFINE_EVENT(session_seqnum_evt, session_seqnum_update, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_seqnum_evt, session_seqnum_reset, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DECLARE_EVENT_CLASS(session_pkt_discard_evt, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) __field(u32, pkt_ns) __field(u32, my_nr) __field(u32, reorder_q_len) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); __entry->pkt_ns = pkt_ns, __entry->my_nr = session->nr; __entry->reorder_q_len = skb_queue_len(&session->reorder_q); ), TP_printk("%s: pkt_ns=%u my_nr=%u reorder_q_len=%u", __entry->name, __entry->pkt_ns, __entry->my_nr, __entry->reorder_q_len) ); DEFINE_EVENT(session_pkt_discard_evt, session_pkt_expired, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns) ); DEFINE_EVENT(session_pkt_discard_evt, session_pkt_outside_rx_window, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns) ); DEFINE_EVENT(session_pkt_discard_evt, session_pkt_oos, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns) ); #endif /* _TRACE_L2TP_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
8 8 8 8 8 8 8 8 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 /* mpiutil.ac - Utility functions for MPI * Copyright (C) 1998, 1999 Free Software Foundation, Inc. * * This file is part of GnuPG. * * GnuPG is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * GnuPG is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ #include "mpi-internal.h" /**************** * Note: It was a bad idea to use the number of limbs to allocate * because on a alpha the limbs are large but we normally need * integers of n bits - So we should change this to bits (or bytes). * * But mpi_alloc is used in a lot of places :-) */ MPI mpi_alloc(unsigned nlimbs) { MPI a; a = kmalloc(sizeof *a, GFP_KERNEL); if (!a) return a; if (nlimbs) { a->d = mpi_alloc_limb_space(nlimbs); if (!a->d) { kfree(a); return NULL; } } else { a->d = NULL; } a->alloced = nlimbs; a->nlimbs = 0; a->sign = 0; a->flags = 0; a->nbits = 0; return a; } EXPORT_SYMBOL_GPL(mpi_alloc); mpi_ptr_t mpi_alloc_limb_space(unsigned nlimbs) { size_t len = nlimbs * sizeof(mpi_limb_t); if (!len) return NULL; return kmalloc(len, GFP_KERNEL); } void mpi_free_limb_space(mpi_ptr_t a) { if (!a) return; kfree_sensitive(a); } void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs) { mpi_free_limb_space(a->d); a->d = ap; a->alloced = nlimbs; } /**************** * Resize the array of A to NLIMBS. the additional space is cleared * (set to 0) [done by m_realloc()] */ int mpi_resize(MPI a, unsigned nlimbs) { void *p; if (nlimbs <= a->alloced) return 0; /* no need to do it */ if (a->d) { p = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL); if (!p) return -ENOMEM; memcpy(p, a->d, a->alloced * sizeof(mpi_limb_t)); kfree_sensitive(a->d); a->d = p; } else { a->d = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL); if (!a->d) return -ENOMEM; } a->alloced = nlimbs; return 0; } void mpi_free(MPI a) { if (!a) return; if (a->flags & 4) kfree_sensitive(a->d); else mpi_free_limb_space(a->d); if (a->flags & ~7) pr_info("invalid flag value in mpi\n"); kfree(a); } EXPORT_SYMBOL_GPL(mpi_free); /**************** * Note: This copy function should not interpret the MPI * but copy it transparently. */ MPI mpi_copy(MPI a) { int i; MPI b; if (a) { b = mpi_alloc(a->nlimbs); if (!b) return NULL; b->nlimbs = a->nlimbs; b->sign = a->sign; b->flags = a->flags; b->flags &= ~(16|32); /* Reset the immutable and constant flags. */ for (i = 0; i < b->nlimbs; i++) b->d[i] = a->d[i]; } else b = NULL; return b; } MODULE_DESCRIPTION("Multiprecision maths library"); MODULE_LICENSE("GPL");
1539 1541 1320 659 146 142 24 53 403 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) /* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ * * This implementation is specifically for SipHash2-4 for a secure PRF * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for * hashtables. */ #include <linux/siphash.h> #include <linux/unaligned.h> #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 #include <linux/dcache.h> #include <asm/word-at-a-time.h> #endif #define SIPROUND SIPHASH_PERMUTATION(v0, v1, v2, v3) #define PREAMBLE(len) \ u64 v0 = SIPHASH_CONST_0; \ u64 v1 = SIPHASH_CONST_1; \ u64 v2 = SIPHASH_CONST_2; \ u64 v3 = SIPHASH_CONST_3; \ u64 b = ((u64)(len)) << 56; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define POSTAMBLE \ v3 ^= b; \ SIPROUND; \ SIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_aligned); #endif u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_unaligned); /** * siphash_1u64 - compute 64-bit siphash PRF value of a u64 * @first: first u64 * @key: the siphash key */ u64 siphash_1u64(const u64 first, const siphash_key_t *key) { PREAMBLE(8) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u64); /** * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64 * @first: first u64 * @second: second u64 * @key: the siphash key */ u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key) { PREAMBLE(16) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; POSTAMBLE } EXPORT_SYMBOL(siphash_2u64); /** * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @key: the siphash key */ u64 siphash_3u64(const u64 first, const u64 second, const u64 third, const siphash_key_t *key) { PREAMBLE(24) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u64); /** * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @forth: forth u64 * @key: the siphash key */ u64 siphash_4u64(const u64 first, const u64 second, const u64 third, const u64 forth, const siphash_key_t *key) { PREAMBLE(32) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; v3 ^= forth; SIPROUND; SIPROUND; v0 ^= forth; POSTAMBLE } EXPORT_SYMBOL(siphash_4u64); u64 siphash_1u32(const u32 first, const siphash_key_t *key) { PREAMBLE(4) b |= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u32); u64 siphash_3u32(const u32 first, const u32 second, const u32 third, const siphash_key_t *key) { u64 combined = (u64)second << 32 | first; PREAMBLE(12) v3 ^= combined; SIPROUND; SIPROUND; v0 ^= combined; b |= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u32); #if BITS_PER_LONG == 64 /* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3. */ #define HSIPROUND SIPROUND #define HPREAMBLE(len) PREAMBLE(len) #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) b |= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(8) v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(12) v3 ^= combined; HSIPROUND; v0 ^= combined; b |= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(16) v3 ^= combined; HSIPROUND; v0 ^= combined; combined = (u64)forth << 32 | third; v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #else #define HSIPROUND HSIPHASH_PERMUTATION(v0, v1, v2, v3) #define HPREAMBLE(len) \ u32 v0 = HSIPHASH_CONST_0; \ u32 v1 = HSIPHASH_CONST_1; \ u32 v2 = HSIPHASH_CONST_2; \ u32 v3 = HSIPHASH_CONST_3; \ u32 b = ((u32)(len)) << 24; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return v1 ^ v3; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = le32_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = get_unaligned_le32(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) v3 ^= first; HSIPROUND; v0 ^= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { HPREAMBLE(8) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { HPREAMBLE(12) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { HPREAMBLE(16) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; v3 ^= forth; HSIPROUND; v0 ^= forth; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #endif
12 18 18 7 1 12 7 12 12 17 17 7 3 5 2 12 5 12 114 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/gen_stats.h> #include <linux/jhash.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/slab.h> #include <net/gen_stats.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_RATEEST.h> #include <net/netfilter/xt_rateest.h> #define RATEEST_HSIZE 16 struct xt_rateest_net { struct mutex hash_lock; struct hlist_head hash[RATEEST_HSIZE]; }; static unsigned int xt_rateest_id; static unsigned int jhash_rnd __read_mostly; static unsigned int xt_rateest_hash(const char *name) { return jhash(name, sizeof_field(struct xt_rateest, name), jhash_rnd) & (RATEEST_HSIZE - 1); } static void xt_rateest_hash_insert(struct xt_rateest_net *xn, struct xt_rateest *est) { unsigned int h; h = xt_rateest_hash(est->name); hlist_add_head(&est->list, &xn->hash[h]); } static struct xt_rateest *__xt_rateest_lookup(struct xt_rateest_net *xn, const char *name) { struct xt_rateest *est; unsigned int h; h = xt_rateest_hash(name); hlist_for_each_entry(est, &xn->hash[h], list) { if (strcmp(est->name, name) == 0) { est->refcnt++; return est; } } return NULL; } struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); struct xt_rateest *est; mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, name); mutex_unlock(&xn->hash_lock); return est; } EXPORT_SYMBOL_GPL(xt_rateest_lookup); void xt_rateest_put(struct net *net, struct xt_rateest *est) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); mutex_lock(&xn->hash_lock); if (--est->refcnt == 0) { hlist_del(&est->list); gen_kill_estimator(&est->rate_est); /* * gen_estimator est_timer() might access est->lock or bstats, * wait a RCU grace period before freeing 'est' */ kfree_rcu(est, rcu); } mutex_unlock(&xn->hash_lock); } EXPORT_SYMBOL_GPL(xt_rateest_put); static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_rateest_target_info *info = par->targinfo; struct gnet_stats_basic_sync *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); u64_stats_add(&stats->bytes, skb->len); u64_stats_inc(&stats->packets); spin_unlock_bh(&info->est->lock); return XT_CONTINUE; } static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) { struct xt_rateest_net *xn = net_generic(par->net, xt_rateest_id); struct xt_rateest_target_info *info = par->targinfo; struct xt_rateest *est; struct { struct nlattr opt; struct gnet_estimator est; } cfg; int ret; if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) return -ENAMETOOLONG; net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, info->name); if (est) { mutex_unlock(&xn->hash_lock); /* * If estimator parameters are specified, they must match the * existing estimator. */ if ((!info->interval && !info->ewma_log) || (info->interval != est->params.interval || info->ewma_log != est->params.ewma_log)) { xt_rateest_put(par->net, est); return -EINVAL; } info->est = est; return 0; } ret = -ENOMEM; est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) goto err1; gnet_stats_basic_sync_init(&est->bstats); strscpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; est->params.interval = info->interval; est->params.ewma_log = info->ewma_log; cfg.opt.nla_len = nla_attr_size(sizeof(cfg.est)); cfg.opt.nla_type = TCA_STATS_RATE_EST; cfg.est.interval = info->interval; cfg.est.ewma_log = info->ewma_log; ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est, &est->lock, NULL, &cfg.opt); if (ret < 0) goto err2; info->est = est; xt_rateest_hash_insert(xn, est); mutex_unlock(&xn->hash_lock); return 0; err2: kfree(est); err1: mutex_unlock(&xn->hash_lock); return ret; } static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) { struct xt_rateest_target_info *info = par->targinfo; xt_rateest_put(par->net, info->est); } static struct xt_target xt_rateest_tg_reg[] __read_mostly = { { .name = "RATEEST", .revision = 0, .family = NFPROTO_IPV4, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "RATEEST", .revision = 0, .family = NFPROTO_IPV6, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }, #endif }; static __net_init int xt_rateest_net_init(struct net *net) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); int i; mutex_init(&xn->hash_lock); for (i = 0; i < ARRAY_SIZE(xn->hash); i++) INIT_HLIST_HEAD(&xn->hash[i]); return 0; } static struct pernet_operations xt_rateest_net_ops = { .init = xt_rateest_net_init, .id = &xt_rateest_id, .size = sizeof(struct xt_rateest_net), }; static int __init xt_rateest_tg_init(void) { int err = register_pernet_subsys(&xt_rateest_net_ops); if (err) return err; return xt_register_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); } static void __exit xt_rateest_tg_fini(void) { xt_unregister_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); unregister_pernet_subsys(&xt_rateest_net_ops); } MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: packet rate estimator"); MODULE_ALIAS("ipt_RATEEST"); MODULE_ALIAS("ip6t_RATEEST"); module_init(xt_rateest_tg_init); module_exit(xt_rateest_tg_fini);
8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright 2019 Google LLC */ #ifndef __LINUX_BLK_CRYPTO_INTERNAL_H #define __LINUX_BLK_CRYPTO_INTERNAL_H #include <linux/bio.h> #include <linux/blk-mq.h> /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ unsigned int security_strength; /* security strength in bytes */ unsigned int ivsize; /* iv size in bytes */ }; extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION int blk_crypto_sysfs_register(struct gendisk *disk); void blk_crypto_sysfs_unregister(struct gendisk *disk); void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio); bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, struct bio_crypt_ctx *bc2); static inline bool bio_crypt_ctx_back_mergeable(struct request *req, struct bio *bio) { return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), bio->bi_crypt_context); } static inline bool bio_crypt_ctx_front_mergeable(struct request *req, struct bio *bio) { return bio_crypt_ctx_mergeable(bio->bi_crypt_context, bio->bi_iter.bi_size, req->crypt_ctx); } static inline bool bio_crypt_ctx_merge_rq(struct request *req, struct request *next) { return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), next->crypt_ctx); } static inline void blk_crypto_rq_set_defaults(struct request *rq) { rq->crypt_ctx = NULL; rq->crypt_keyslot = NULL; } static inline bool blk_crypto_rq_is_encrypted(struct request *rq) { return rq->crypt_ctx; } static inline bool blk_crypto_rq_has_keyslot(struct request *rq) { return rq->crypt_keyslot; } blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, struct blk_crypto_keyslot **slot_ptr); void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); int __blk_crypto_evict_key(struct blk_crypto_profile *profile, const struct blk_crypto_key *key); bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, const struct blk_crypto_config *cfg); int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, void __user *argp); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) { return 0; } static inline void blk_crypto_sysfs_unregister(struct gendisk *disk) { } static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_front_mergeable(struct request *req, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_back_mergeable(struct request *req, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_merge_rq(struct request *req, struct request *next) { return true; } static inline void blk_crypto_rq_set_defaults(struct request *rq) { } static inline bool blk_crypto_rq_is_encrypted(struct request *rq) { return false; } static inline bool blk_crypto_rq_has_keyslot(struct request *rq) { return false; } static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, void __user *argp) { return -ENOTTY; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); static inline void bio_crypt_advance(struct bio *bio, unsigned int bytes) { if (bio_has_crypt_ctx(bio)) __bio_crypt_advance(bio, bytes); } void __bio_crypt_free_ctx(struct bio *bio); static inline void bio_crypt_free_ctx(struct bio *bio) { if (bio_has_crypt_ctx(bio)) __bio_crypt_free_ctx(bio); } static inline void bio_crypt_do_front_merge(struct request *rq, struct bio *bio) { #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (bio_has_crypt_ctx(bio)) memcpy(rq->crypt_ctx->bc_dun, bio->bi_crypt_context->bc_dun, sizeof(rq->crypt_ctx->bc_dun)); #endif } bool __blk_crypto_bio_prep(struct bio **bio_ptr); static inline bool blk_crypto_bio_prep(struct bio **bio_ptr) { if (bio_has_crypt_ctx(*bio_ptr)) return __blk_crypto_bio_prep(bio_ptr); return true; } blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq); static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) return __blk_crypto_rq_get_keyslot(rq); return BLK_STS_OK; } void __blk_crypto_rq_put_keyslot(struct request *rq); static inline void blk_crypto_rq_put_keyslot(struct request *rq) { if (blk_crypto_rq_has_keyslot(rq)) __blk_crypto_rq_put_keyslot(rq); } void __blk_crypto_free_request(struct request *rq); static inline void blk_crypto_free_request(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) __blk_crypto_free_request(rq); } int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask); /** * blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio * is inserted * @rq: The request to prepare * @bio: The first bio being inserted into the request * @gfp_mask: Memory allocation flags * * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. */ static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask) { if (bio_has_crypt_ctx(bio)) return __blk_crypto_rq_bio_prep(rq, bio, gfp_mask); return 0; } #ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num); bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr); int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key); #else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ static inline int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) { pr_warn_once("crypto API fallback is disabled\n"); return -ENOPKG; } static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) { pr_warn_once("crypto API fallback disabled; failing request.\n"); (*bio_ptr)->bi_status = BLK_STS_NOTSUPP; return false; } static inline int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { return 0; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ #endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 // SPDX-License-Identifier: GPL-2.0 /* * NVMe over Fabrics RDMA target. * Copyright (c) 2015-2016 HGST, a Western Digital Company. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/atomic.h> #include <linux/blk-integrity.h> #include <linux/ctype.h> #include <linux/delay.h> #include <linux/err.h> #include <linux/init.h> #include <linux/module.h> #include <linux/nvme.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/wait.h> #include <linux/inet.h> #include <linux/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #include <rdma/rw.h> #include <rdma/ib_cm.h> #include <linux/nvme-rdma.h> #include "nvmet.h" /* * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data */ #define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE PAGE_SIZE #define NVMET_RDMA_MAX_INLINE_SGE 4 #define NVMET_RDMA_MAX_INLINE_DATA_SIZE max_t(int, SZ_16K, PAGE_SIZE) /* Assume mpsmin == device_page_size == 4KB */ #define NVMET_RDMA_MAX_MDTS 8 #define NVMET_RDMA_MAX_METADATA_MDTS 5 #define NVMET_RDMA_BACKLOG 128 #define NVMET_RDMA_DISCRETE_RSP_TAG -1 struct nvmet_rdma_srq; struct nvmet_rdma_cmd { struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1]; struct ib_cqe cqe; struct ib_recv_wr wr; struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE]; struct nvme_command *nvme_cmd; struct nvmet_rdma_queue *queue; struct nvmet_rdma_srq *nsrq; }; enum { NVMET_RDMA_REQ_INLINE_DATA = (1 << 0), }; struct nvmet_rdma_rsp { struct ib_sge send_sge; struct ib_cqe send_cqe; struct ib_send_wr send_wr; struct nvmet_rdma_cmd *cmd; struct nvmet_rdma_queue *queue; struct ib_cqe read_cqe; struct ib_cqe write_cqe; struct rdma_rw_ctx rw; struct nvmet_req req; bool allocated; u8 n_rdma; u32 flags; u32 invalidate_rkey; struct list_head wait_list; int tag; }; enum nvmet_rdma_queue_state { NVMET_RDMA_Q_CONNECTING, NVMET_RDMA_Q_LIVE, NVMET_RDMA_Q_DISCONNECTING, }; struct nvmet_rdma_queue { struct rdma_cm_id *cm_id; struct ib_qp *qp; struct nvmet_port *port; struct ib_cq *cq; atomic_t sq_wr_avail; struct nvmet_rdma_device *dev; struct nvmet_rdma_srq *nsrq; spinlock_t state_lock; enum nvmet_rdma_queue_state state; struct nvmet_cq nvme_cq; struct nvmet_sq nvme_sq; struct nvmet_rdma_rsp *rsps; struct sbitmap rsp_tags; struct nvmet_rdma_cmd *cmds; struct work_struct release_work; struct list_head rsp_wait_list; struct list_head rsp_wr_wait_list; spinlock_t rsp_wr_wait_lock; int idx; int host_qid; int comp_vector; int recv_queue_size; int send_queue_size; struct list_head queue_list; }; struct nvmet_rdma_port { struct nvmet_port *nport; struct sockaddr_storage addr; struct rdma_cm_id *cm_id; struct delayed_work repair_work; }; struct nvmet_rdma_srq { struct ib_srq *srq; struct nvmet_rdma_cmd *cmds; struct nvmet_rdma_device *ndev; }; struct nvmet_rdma_device { struct ib_device *device; struct ib_pd *pd; struct nvmet_rdma_srq **srqs; int srq_count; size_t srq_size; struct kref ref; struct list_head entry; int inline_data_size; int inline_page_count; }; static bool nvmet_rdma_use_srq; module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444); MODULE_PARM_DESC(use_srq, "Use shared receive queue."); static int srq_size_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops srq_size_ops = { .set = srq_size_set, .get = param_get_int, }; static int nvmet_rdma_srq_size = 1024; module_param_cb(srq_size, &srq_size_ops, &nvmet_rdma_srq_size, 0644); MODULE_PARM_DESC(srq_size, "set Shared Receive Queue (SRQ) size, should >= 256 (default: 1024)"); static DEFINE_IDA(nvmet_rdma_queue_ida); static LIST_HEAD(nvmet_rdma_queue_list); static DEFINE_MUTEX(nvmet_rdma_queue_mutex); static LIST_HEAD(device_list); static DEFINE_MUTEX(device_list_mutex); static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp); static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc); static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc); static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc); static void nvmet_rdma_qp_event(struct ib_event *event, void *priv); static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue); static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev, struct nvmet_rdma_rsp *r); static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, struct nvmet_rdma_rsp *r, int tag); static const struct nvmet_fabrics_ops nvmet_rdma_ops; static int srq_size_set(const char *val, const struct kernel_param *kp) { int n = 0, ret; ret = kstrtoint(val, 10, &n); if (ret != 0 || n < 256) return -EINVAL; return param_set_int(val, kp); } static int num_pages(int len) { return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT); } static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) { return nvme_is_write(rsp->req.cmd) && rsp->req.transfer_len && !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); } static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) { return !nvme_is_write(rsp->req.cmd) && rsp->req.transfer_len && !rsp->req.cqe->status && !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); } static inline struct nvmet_rdma_rsp * nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue) { struct nvmet_rdma_rsp *rsp = NULL; int tag; tag = sbitmap_get(&queue->rsp_tags); if (tag >= 0) rsp = &queue->rsps[tag]; if (unlikely(!rsp)) { int ret; rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); if (unlikely(!rsp)) return NULL; ret = nvmet_rdma_alloc_rsp(queue->dev, rsp, NVMET_RDMA_DISCRETE_RSP_TAG); if (unlikely(ret)) { kfree(rsp); return NULL; } } return rsp; } static inline void nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) { if (unlikely(rsp->tag == NVMET_RDMA_DISCRETE_RSP_TAG)) { nvmet_rdma_free_rsp(rsp->queue->dev, rsp); kfree(rsp); return; } sbitmap_clear_bit(&rsp->queue->rsp_tags, rsp->tag); } static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *c) { struct scatterlist *sg; struct ib_sge *sge; int i; if (!ndev->inline_data_size) return; sg = c->inline_sg; sge = &c->sge[1]; for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { if (sge->length) ib_dma_unmap_page(ndev->device, sge->addr, sge->length, DMA_FROM_DEVICE); if (sg_page(sg)) __free_page(sg_page(sg)); } } static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *c) { struct scatterlist *sg; struct ib_sge *sge; struct page *pg; int len; int i; if (!ndev->inline_data_size) return 0; sg = c->inline_sg; sg_init_table(sg, ndev->inline_page_count); sge = &c->sge[1]; len = ndev->inline_data_size; for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { pg = alloc_page(GFP_KERNEL); if (!pg) goto out_err; sg_assign_page(sg, pg); sge->addr = ib_dma_map_page(ndev->device, pg, 0, PAGE_SIZE, DMA_FROM_DEVICE); if (ib_dma_mapping_error(ndev->device, sge->addr)) goto out_err; sge->length = min_t(int, len, PAGE_SIZE); sge->lkey = ndev->pd->local_dma_lkey; len -= sge->length; } return 0; out_err: for (; i >= 0; i--, sg--, sge--) { if (sge->length) ib_dma_unmap_page(ndev->device, sge->addr, sge->length, DMA_FROM_DEVICE); if (sg_page(sg)) __free_page(sg_page(sg)); } return -ENOMEM; } static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *c, bool admin) { /* NVMe command / RDMA RECV */ c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL); if (!c->nvme_cmd) goto out; c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd, sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); if (ib_dma_mapping_error(ndev->device, c->sge[0].addr)) goto out_free_cmd; c->sge[0].length = sizeof(*c->nvme_cmd); c->sge[0].lkey = ndev->pd->local_dma_lkey; if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c)) goto out_unmap_cmd; c->cqe.done = nvmet_rdma_recv_done; c->wr.wr_cqe = &c->cqe; c->wr.sg_list = c->sge; c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1; return 0; out_unmap_cmd: ib_dma_unmap_single(ndev->device, c->sge[0].addr, sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); out_free_cmd: kfree(c->nvme_cmd); out: return -ENOMEM; } static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *c, bool admin) { if (!admin) nvmet_rdma_free_inline_pages(ndev, c); ib_dma_unmap_single(ndev->device, c->sge[0].addr, sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); kfree(c->nvme_cmd); } static struct nvmet_rdma_cmd * nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev, int nr_cmds, bool admin) { struct nvmet_rdma_cmd *cmds; int ret = -EINVAL, i; cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL); if (!cmds) goto out; for (i = 0; i < nr_cmds; i++) { ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin); if (ret) goto out_free; } return cmds; out_free: while (--i >= 0) nvmet_rdma_free_cmd(ndev, cmds + i, admin); kfree(cmds); out: return ERR_PTR(ret); } static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin) { int i; for (i = 0; i < nr_cmds; i++) nvmet_rdma_free_cmd(ndev, cmds + i, admin); kfree(cmds); } static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, struct nvmet_rdma_rsp *r, int tag) { /* NVMe CQE / RDMA SEND */ r->req.cqe = kmalloc(sizeof(*r->req.cqe), GFP_KERNEL); if (!r->req.cqe) goto out; r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.cqe, sizeof(*r->req.cqe), DMA_TO_DEVICE); if (ib_dma_mapping_error(ndev->device, r->send_sge.addr)) goto out_free_rsp; if (ib_dma_pci_p2p_dma_supported(ndev->device)) r->req.p2p_client = &ndev->device->dev; r->send_sge.length = sizeof(*r->req.cqe); r->send_sge.lkey = ndev->pd->local_dma_lkey; r->send_cqe.done = nvmet_rdma_send_done; r->send_wr.wr_cqe = &r->send_cqe; r->send_wr.sg_list = &r->send_sge; r->send_wr.num_sge = 1; r->send_wr.send_flags = IB_SEND_SIGNALED; /* Data In / RDMA READ */ r->read_cqe.done = nvmet_rdma_read_data_done; /* Data Out / RDMA WRITE */ r->write_cqe.done = nvmet_rdma_write_data_done; r->tag = tag; return 0; out_free_rsp: kfree(r->req.cqe); out: return -ENOMEM; } static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev, struct nvmet_rdma_rsp *r) { ib_dma_unmap_single(ndev->device, r->send_sge.addr, sizeof(*r->req.cqe), DMA_TO_DEVICE); kfree(r->req.cqe); } static int nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue) { struct nvmet_rdma_device *ndev = queue->dev; int nr_rsps = queue->recv_queue_size * 2; int ret = -ENOMEM, i; if (sbitmap_init_node(&queue->rsp_tags, nr_rsps, -1, GFP_KERNEL, NUMA_NO_NODE, false, true)) goto out; queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp), GFP_KERNEL); if (!queue->rsps) goto out_free_sbitmap; for (i = 0; i < nr_rsps; i++) { struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; ret = nvmet_rdma_alloc_rsp(ndev, rsp, i); if (ret) goto out_free; } return 0; out_free: while (--i >= 0) nvmet_rdma_free_rsp(ndev, &queue->rsps[i]); kfree(queue->rsps); out_free_sbitmap: sbitmap_free(&queue->rsp_tags); out: return ret; } static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue) { struct nvmet_rdma_device *ndev = queue->dev; int i, nr_rsps = queue->recv_queue_size * 2; for (i = 0; i < nr_rsps; i++) nvmet_rdma_free_rsp(ndev, &queue->rsps[i]); kfree(queue->rsps); sbitmap_free(&queue->rsp_tags); } static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *cmd) { int ret; ib_dma_sync_single_for_device(ndev->device, cmd->sge[0].addr, cmd->sge[0].length, DMA_FROM_DEVICE); if (cmd->nsrq) ret = ib_post_srq_recv(cmd->nsrq->srq, &cmd->wr, NULL); else ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL); if (unlikely(ret)) pr_err("post_recv cmd failed\n"); return ret; } static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) { spin_lock(&queue->rsp_wr_wait_lock); while (!list_empty(&queue->rsp_wr_wait_list)) { struct nvmet_rdma_rsp *rsp; bool ret; rsp = list_entry(queue->rsp_wr_wait_list.next, struct nvmet_rdma_rsp, wait_list); list_del(&rsp->wait_list); spin_unlock(&queue->rsp_wr_wait_lock); ret = nvmet_rdma_execute_command(rsp); spin_lock(&queue->rsp_wr_wait_lock); if (!ret) { list_add(&rsp->wait_list, &queue->rsp_wr_wait_list); break; } } spin_unlock(&queue->rsp_wr_wait_lock); } static u16 nvmet_rdma_check_pi_status(struct ib_mr *sig_mr) { struct ib_mr_status mr_status; int ret; u16 status = 0; ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); if (ret) { pr_err("ib_check_mr_status failed, ret %d\n", ret); return NVME_SC_INVALID_PI; } if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { switch (mr_status.sig_err.err_type) { case IB_SIG_BAD_GUARD: status = NVME_SC_GUARD_CHECK; break; case IB_SIG_BAD_REFTAG: status = NVME_SC_REFTAG_CHECK; break; case IB_SIG_BAD_APPTAG: status = NVME_SC_APPTAG_CHECK; break; } pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n", mr_status.sig_err.err_type, mr_status.sig_err.expected, mr_status.sig_err.actual); } return status; } static void nvmet_rdma_set_sig_domain(struct blk_integrity *bi, struct nvme_command *cmd, struct ib_sig_domain *domain, u16 control, u8 pi_type) { domain->sig_type = IB_SIG_TYPE_T10_DIF; domain->sig.dif.bg_type = IB_T10DIF_CRC; domain->sig.dif.pi_interval = 1 << bi->interval_exp; domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag); if (control & NVME_RW_PRINFO_PRCHK_REF) domain->sig.dif.ref_remap = true; domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); domain->sig.dif.app_escape = true; if (pi_type == NVME_NS_DPS_PI_TYPE3) domain->sig.dif.ref_escape = true; } static void nvmet_rdma_set_sig_attrs(struct nvmet_req *req, struct ib_sig_attrs *sig_attrs) { struct nvme_command *cmd = req->cmd; u16 control = le16_to_cpu(cmd->rw.control); u8 pi_type = req->ns->pi_type; struct blk_integrity *bi; bi = bdev_get_integrity(req->ns->bdev); memset(sig_attrs, 0, sizeof(*sig_attrs)); if (control & NVME_RW_PRINFO_PRACT) { /* for WRITE_INSERT/READ_STRIP no wire domain */ sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control, pi_type); /* Clear the PRACT bit since HCA will generate/verify the PI */ control &= ~NVME_RW_PRINFO_PRACT; cmd->rw.control = cpu_to_le16(control); /* PI is added by the HW */ req->transfer_len += req->metadata_len; } else { /* for WRITE_PASS/READ_PASS both wire/memory domains exist */ nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control, pi_type); nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control, pi_type); } if (control & NVME_RW_PRINFO_PRCHK_REF) sig_attrs->check_mask |= IB_SIG_CHECK_REFTAG; if (control & NVME_RW_PRINFO_PRCHK_GUARD) sig_attrs->check_mask |= IB_SIG_CHECK_GUARD; if (control & NVME_RW_PRINFO_PRCHK_APP) sig_attrs->check_mask |= IB_SIG_CHECK_APPTAG; } static int nvmet_rdma_rw_ctx_init(struct nvmet_rdma_rsp *rsp, u64 addr, u32 key, struct ib_sig_attrs *sig_attrs) { struct rdma_cm_id *cm_id = rsp->queue->cm_id; struct nvmet_req *req = &rsp->req; int ret; if (req->metadata_len) ret = rdma_rw_ctx_signature_init(&rsp->rw, cm_id->qp, cm_id->port_num, req->sg, req->sg_cnt, req->metadata_sg, req->metadata_sg_cnt, sig_attrs, addr, key, nvmet_data_dir(req)); else ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, req->sg, req->sg_cnt, 0, addr, key, nvmet_data_dir(req)); return ret; } static void nvmet_rdma_rw_ctx_destroy(struct nvmet_rdma_rsp *rsp) { struct rdma_cm_id *cm_id = rsp->queue->cm_id; struct nvmet_req *req = &rsp->req; if (req->metadata_len) rdma_rw_ctx_destroy_signature(&rsp->rw, cm_id->qp, cm_id->port_num, req->sg, req->sg_cnt, req->metadata_sg, req->metadata_sg_cnt, nvmet_data_dir(req)); else rdma_rw_ctx_destroy(&rsp->rw, cm_id->qp, cm_id->port_num, req->sg, req->sg_cnt, nvmet_data_dir(req)); } static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) { struct nvmet_rdma_queue *queue = rsp->queue; atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); if (rsp->n_rdma) nvmet_rdma_rw_ctx_destroy(rsp); if (rsp->req.sg != rsp->cmd->inline_sg) nvmet_req_free_sgls(&rsp->req); if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) nvmet_rdma_process_wr_wait_list(queue); nvmet_rdma_put_rsp(rsp); } static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue) { if (queue->nvme_sq.ctrl) { nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); } else { /* * we didn't setup the controller yet in case * of admin connect error, just disconnect and * cleanup the queue */ nvmet_rdma_queue_disconnect(queue); } } static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_rsp *rsp = container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe); struct nvmet_rdma_queue *queue = wc->qp->qp_context; nvmet_rdma_release_rsp(rsp); if (unlikely(wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)) { pr_err("SEND for CQE 0x%p failed with status %s (%d).\n", wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); nvmet_rdma_error_comp(queue); } } static void nvmet_rdma_queue_response(struct nvmet_req *req) { struct nvmet_rdma_rsp *rsp = container_of(req, struct nvmet_rdma_rsp, req); struct rdma_cm_id *cm_id = rsp->queue->cm_id; struct ib_send_wr *first_wr; if (rsp->invalidate_rkey) { rsp->send_wr.opcode = IB_WR_SEND_WITH_INV; rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey; } else { rsp->send_wr.opcode = IB_WR_SEND; } if (nvmet_rdma_need_data_out(rsp)) { if (rsp->req.metadata_len) first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, cm_id->port_num, &rsp->write_cqe, NULL); else first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, cm_id->port_num, NULL, &rsp->send_wr); } else { first_wr = &rsp->send_wr; } nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd); ib_dma_sync_single_for_device(rsp->queue->dev->device, rsp->send_sge.addr, rsp->send_sge.length, DMA_TO_DEVICE); if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) { pr_err("sending cmd response failed\n"); nvmet_rdma_release_rsp(rsp); } } static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_rsp *rsp = container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe); struct nvmet_rdma_queue *queue = wc->qp->qp_context; u16 status = 0; WARN_ON(rsp->n_rdma <= 0); atomic_add(rsp->n_rdma, &queue->sq_wr_avail); rsp->n_rdma = 0; if (unlikely(wc->status != IB_WC_SUCCESS)) { nvmet_rdma_rw_ctx_destroy(rsp); nvmet_req_uninit(&rsp->req); nvmet_rdma_release_rsp(rsp); if (wc->status != IB_WC_WR_FLUSH_ERR) { pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n", wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); nvmet_rdma_error_comp(queue); } return; } if (rsp->req.metadata_len) status = nvmet_rdma_check_pi_status(rsp->rw.reg->mr); nvmet_rdma_rw_ctx_destroy(rsp); if (unlikely(status)) nvmet_req_complete(&rsp->req, status); else rsp->req.execute(&rsp->req); } static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_rsp *rsp = container_of(wc->wr_cqe, struct nvmet_rdma_rsp, write_cqe); struct nvmet_rdma_queue *queue = wc->qp->qp_context; struct rdma_cm_id *cm_id = rsp->queue->cm_id; u16 status; if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) return; WARN_ON(rsp->n_rdma <= 0); atomic_add(rsp->n_rdma, &queue->sq_wr_avail); rsp->n_rdma = 0; if (unlikely(wc->status != IB_WC_SUCCESS)) { nvmet_rdma_rw_ctx_destroy(rsp); nvmet_req_uninit(&rsp->req); nvmet_rdma_release_rsp(rsp); if (wc->status != IB_WC_WR_FLUSH_ERR) { pr_info("RDMA WRITE for CQE failed with status %s (%d).\n", ib_wc_status_msg(wc->status), wc->status); nvmet_rdma_error_comp(queue); } return; } /* * Upon RDMA completion check the signature status * - if succeeded send good NVMe response * - if failed send bad NVMe response with appropriate error */ status = nvmet_rdma_check_pi_status(rsp->rw.reg->mr); if (unlikely(status)) rsp->req.cqe->status = cpu_to_le16(status << 1); nvmet_rdma_rw_ctx_destroy(rsp); if (unlikely(ib_post_send(cm_id->qp, &rsp->send_wr, NULL))) { pr_err("sending cmd response failed\n"); nvmet_rdma_release_rsp(rsp); } } static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, u64 off) { int sg_count = num_pages(len); struct scatterlist *sg; int i; sg = rsp->cmd->inline_sg; for (i = 0; i < sg_count; i++, sg++) { if (i < sg_count - 1) sg_unmark_end(sg); else sg_mark_end(sg); sg->offset = off; sg->length = min_t(int, len, PAGE_SIZE - off); len -= sg->length; if (!i) off = 0; } rsp->req.sg = rsp->cmd->inline_sg; rsp->req.sg_cnt = sg_count; } static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) { struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl; u64 off = le64_to_cpu(sgl->addr); u32 len = le32_to_cpu(sgl->length); if (!nvme_is_write(rsp->req.cmd)) { rsp->req.error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } if (off + len > rsp->queue->dev->inline_data_size) { pr_err("invalid inline data offset!\n"); return NVME_SC_SGL_INVALID_OFFSET | NVME_STATUS_DNR; } /* no data command? */ if (!len) return 0; nvmet_rdma_use_inline_sg(rsp, len, off); rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; rsp->req.transfer_len += len; return 0; } static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, struct nvme_keyed_sgl_desc *sgl, bool invalidate) { u64 addr = le64_to_cpu(sgl->addr); u32 key = get_unaligned_le32(sgl->key); struct ib_sig_attrs sig_attrs; int ret; rsp->req.transfer_len = get_unaligned_le24(sgl->length); /* no data command? */ if (!rsp->req.transfer_len) return 0; if (rsp->req.metadata_len) nvmet_rdma_set_sig_attrs(&rsp->req, &sig_attrs); ret = nvmet_req_alloc_sgls(&rsp->req); if (unlikely(ret < 0)) goto error_out; ret = nvmet_rdma_rw_ctx_init(rsp, addr, key, &sig_attrs); if (unlikely(ret < 0)) goto error_out; rsp->n_rdma += ret; if (invalidate) rsp->invalidate_rkey = key; return 0; error_out: rsp->req.transfer_len = 0; return NVME_SC_INTERNAL; } static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) { struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl; switch (sgl->type >> 4) { case NVME_SGL_FMT_DATA_DESC: switch (sgl->type & 0xf) { case NVME_SGL_FMT_OFFSET: return nvmet_rdma_map_sgl_inline(rsp); default: pr_err("invalid SGL subtype: %#x\n", sgl->type); rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } case NVME_KEY_SGL_FMT_DATA_DESC: switch (sgl->type & 0xf) { case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE: return nvmet_rdma_map_sgl_keyed(rsp, sgl, true); case NVME_SGL_FMT_ADDRESS: return nvmet_rdma_map_sgl_keyed(rsp, sgl, false); default: pr_err("invalid SGL subtype: %#x\n", sgl->type); rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } default: pr_err("invalid SGL type: %#x\n", sgl->type); rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_SGL_INVALID_TYPE | NVME_STATUS_DNR; } } static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) { struct nvmet_rdma_queue *queue = rsp->queue; if (unlikely(atomic_sub_return(1 + rsp->n_rdma, &queue->sq_wr_avail) < 0)) { pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n", 1 + rsp->n_rdma, queue->idx, queue->nvme_sq.ctrl->cntlid); atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); return false; } if (nvmet_rdma_need_data_in(rsp)) { if (rdma_rw_ctx_post(&rsp->rw, queue->qp, queue->cm_id->port_num, &rsp->read_cqe, NULL)) nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); } else { rsp->req.execute(&rsp->req); } return true; } static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, struct nvmet_rdma_rsp *cmd) { u16 status; ib_dma_sync_single_for_cpu(queue->dev->device, cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length, DMA_FROM_DEVICE); ib_dma_sync_single_for_cpu(queue->dev->device, cmd->send_sge.addr, cmd->send_sge.length, DMA_TO_DEVICE); if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, &queue->nvme_sq, &nvmet_rdma_ops)) return; status = nvmet_rdma_map_sgl(cmd); if (status) goto out_err; if (unlikely(!nvmet_rdma_execute_command(cmd))) { spin_lock(&queue->rsp_wr_wait_lock); list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list); spin_unlock(&queue->rsp_wr_wait_lock); } return; out_err: nvmet_req_complete(&cmd->req, status); } static bool nvmet_rdma_recv_not_live(struct nvmet_rdma_queue *queue, struct nvmet_rdma_rsp *rsp) { unsigned long flags; bool ret = true; spin_lock_irqsave(&queue->state_lock, flags); /* * recheck queue state is not live to prevent a race condition * with RDMA_CM_EVENT_ESTABLISHED handler. */ if (queue->state == NVMET_RDMA_Q_LIVE) ret = false; else if (queue->state == NVMET_RDMA_Q_CONNECTING) list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); else nvmet_rdma_put_rsp(rsp); spin_unlock_irqrestore(&queue->state_lock, flags); return ret; } static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_cmd *cmd = container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe); struct nvmet_rdma_queue *queue = wc->qp->qp_context; struct nvmet_rdma_rsp *rsp; if (unlikely(wc->status != IB_WC_SUCCESS)) { if (wc->status != IB_WC_WR_FLUSH_ERR) { pr_err("RECV for CQE 0x%p failed with status %s (%d)\n", wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); nvmet_rdma_error_comp(queue); } return; } if (unlikely(wc->byte_len < sizeof(struct nvme_command))) { pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n"); nvmet_rdma_error_comp(queue); return; } cmd->queue = queue; rsp = nvmet_rdma_get_rsp(queue); if (unlikely(!rsp)) { /* * we get here only under memory pressure, * silently drop and have the host retry * as we can't even fail it. */ nvmet_rdma_post_recv(queue->dev, cmd); return; } rsp->queue = queue; rsp->cmd = cmd; rsp->flags = 0; rsp->req.cmd = cmd->nvme_cmd; rsp->req.port = queue->port; rsp->n_rdma = 0; rsp->invalidate_rkey = 0; if (unlikely(queue->state != NVMET_RDMA_Q_LIVE) && nvmet_rdma_recv_not_live(queue, rsp)) return; nvmet_rdma_handle_command(queue, rsp); } static void nvmet_rdma_destroy_srq(struct nvmet_rdma_srq *nsrq) { nvmet_rdma_free_cmds(nsrq->ndev, nsrq->cmds, nsrq->ndev->srq_size, false); ib_destroy_srq(nsrq->srq); kfree(nsrq); } static void nvmet_rdma_destroy_srqs(struct nvmet_rdma_device *ndev) { int i; if (!ndev->srqs) return; for (i = 0; i < ndev->srq_count; i++) nvmet_rdma_destroy_srq(ndev->srqs[i]); kfree(ndev->srqs); } static struct nvmet_rdma_srq * nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) { struct ib_srq_init_attr srq_attr = { NULL, }; size_t srq_size = ndev->srq_size; struct nvmet_rdma_srq *nsrq; struct ib_srq *srq; int ret, i; nsrq = kzalloc(sizeof(*nsrq), GFP_KERNEL); if (!nsrq) return ERR_PTR(-ENOMEM); srq_attr.attr.max_wr = srq_size; srq_attr.attr.max_sge = 1 + ndev->inline_page_count; srq_attr.attr.srq_limit = 0; srq_attr.srq_type = IB_SRQT_BASIC; srq = ib_create_srq(ndev->pd, &srq_attr); if (IS_ERR(srq)) { ret = PTR_ERR(srq); goto out_free; } nsrq->cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false); if (IS_ERR(nsrq->cmds)) { ret = PTR_ERR(nsrq->cmds); goto out_destroy_srq; } nsrq->srq = srq; nsrq->ndev = ndev; for (i = 0; i < srq_size; i++) { nsrq->cmds[i].nsrq = nsrq; ret = nvmet_rdma_post_recv(ndev, &nsrq->cmds[i]); if (ret) goto out_free_cmds; } return nsrq; out_free_cmds: nvmet_rdma_free_cmds(ndev, nsrq->cmds, srq_size, false); out_destroy_srq: ib_destroy_srq(srq); out_free: kfree(nsrq); return ERR_PTR(ret); } static int nvmet_rdma_init_srqs(struct nvmet_rdma_device *ndev) { int i, ret; if (!ndev->device->attrs.max_srq_wr || !ndev->device->attrs.max_srq) { /* * If SRQs aren't supported we just go ahead and use normal * non-shared receive queues. */ pr_info("SRQ requested but not supported.\n"); return 0; } ndev->srq_size = min(ndev->device->attrs.max_srq_wr, nvmet_rdma_srq_size); ndev->srq_count = min(ndev->device->num_comp_vectors, ndev->device->attrs.max_srq); ndev->srqs = kcalloc(ndev->srq_count, sizeof(*ndev->srqs), GFP_KERNEL); if (!ndev->srqs) return -ENOMEM; for (i = 0; i < ndev->srq_count; i++) { ndev->srqs[i] = nvmet_rdma_init_srq(ndev); if (IS_ERR(ndev->srqs[i])) { ret = PTR_ERR(ndev->srqs[i]); goto err_srq; } } return 0; err_srq: while (--i >= 0) nvmet_rdma_destroy_srq(ndev->srqs[i]); kfree(ndev->srqs); return ret; } static void nvmet_rdma_free_dev(struct kref *ref) { struct nvmet_rdma_device *ndev = container_of(ref, struct nvmet_rdma_device, ref); mutex_lock(&device_list_mutex); list_del(&ndev->entry); mutex_unlock(&device_list_mutex); nvmet_rdma_destroy_srqs(ndev); ib_dealloc_pd(ndev->pd); kfree(ndev); } static struct nvmet_rdma_device * nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) { struct nvmet_rdma_port *port = cm_id->context; struct nvmet_port *nport = port->nport; struct nvmet_rdma_device *ndev; int inline_page_count; int inline_sge_count; int ret; mutex_lock(&device_list_mutex); list_for_each_entry(ndev, &device_list, entry) { if (ndev->device->node_guid == cm_id->device->node_guid && kref_get_unless_zero(&ndev->ref)) goto out_unlock; } ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); if (!ndev) goto out_err; inline_page_count = num_pages(nport->inline_data_size); inline_sge_count = max(cm_id->device->attrs.max_sge_rd, cm_id->device->attrs.max_recv_sge) - 1; if (inline_page_count > inline_sge_count) { pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n", nport->inline_data_size, cm_id->device->name, inline_sge_count * PAGE_SIZE); nport->inline_data_size = inline_sge_count * PAGE_SIZE; inline_page_count = inline_sge_count; } ndev->inline_data_size = nport->inline_data_size; ndev->inline_page_count = inline_page_count; if (nport->pi_enable && !(cm_id->device->attrs.kernel_cap_flags & IBK_INTEGRITY_HANDOVER)) { pr_warn("T10-PI is not supported by device %s. Disabling it\n", cm_id->device->name); nport->pi_enable = false; } ndev->device = cm_id->device; kref_init(&ndev->ref); ndev->pd = ib_alloc_pd(ndev->device, 0); if (IS_ERR(ndev->pd)) goto out_free_dev; if (nvmet_rdma_use_srq) { ret = nvmet_rdma_init_srqs(ndev); if (ret) goto out_free_pd; } list_add(&ndev->entry, &device_list); out_unlock: mutex_unlock(&device_list_mutex); pr_debug("added %s.\n", ndev->device->name); return ndev; out_free_pd: ib_dealloc_pd(ndev->pd); out_free_dev: kfree(ndev); out_err: mutex_unlock(&device_list_mutex); return NULL; } static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) { struct ib_qp_init_attr qp_attr = { }; struct nvmet_rdma_device *ndev = queue->dev; int nr_cqe, ret, i, factor; /* * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND. */ nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size; queue->cq = ib_cq_pool_get(ndev->device, nr_cqe + 1, queue->comp_vector, IB_POLL_WORKQUEUE); if (IS_ERR(queue->cq)) { ret = PTR_ERR(queue->cq); pr_err("failed to create CQ cqe= %d ret= %d\n", nr_cqe + 1, ret); goto out; } qp_attr.qp_context = queue; qp_attr.event_handler = nvmet_rdma_qp_event; qp_attr.send_cq = queue->cq; qp_attr.recv_cq = queue->cq; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; /* +1 for drain */ qp_attr.cap.max_send_wr = queue->send_queue_size + 1; factor = rdma_rw_mr_factor(ndev->device, queue->cm_id->port_num, 1 << NVMET_RDMA_MAX_MDTS); qp_attr.cap.max_rdma_ctxs = queue->send_queue_size * factor; qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd, ndev->device->attrs.max_send_sge); if (queue->nsrq) { qp_attr.srq = queue->nsrq->srq; } else { /* +1 for drain */ qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count; } if (queue->port->pi_enable && queue->host_qid) qp_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); if (ret) { pr_err("failed to create_qp ret= %d\n", ret); goto err_destroy_cq; } queue->qp = queue->cm_id->qp; atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr); pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n", __func__, queue->cq->cqe, qp_attr.cap.max_send_sge, qp_attr.cap.max_send_wr, queue->cm_id); if (!queue->nsrq) { for (i = 0; i < queue->recv_queue_size; i++) { queue->cmds[i].queue = queue; ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]); if (ret) goto err_destroy_qp; } } out: return ret; err_destroy_qp: rdma_destroy_qp(queue->cm_id); err_destroy_cq: ib_cq_pool_put(queue->cq, nr_cqe + 1); goto out; } static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) { ib_drain_qp(queue->qp); if (queue->cm_id) rdma_destroy_id(queue->cm_id); ib_destroy_qp(queue->qp); ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 * queue->send_queue_size + 1); } static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) { pr_debug("freeing queue %d\n", queue->idx); nvmet_sq_destroy(&queue->nvme_sq); nvmet_rdma_destroy_queue_ib(queue); if (!queue->nsrq) { nvmet_rdma_free_cmds(queue->dev, queue->cmds, queue->recv_queue_size, !queue->host_qid); } nvmet_rdma_free_rsps(queue); ida_free(&nvmet_rdma_queue_ida, queue->idx); kfree(queue); } static void nvmet_rdma_release_queue_work(struct work_struct *w) { struct nvmet_rdma_queue *queue = container_of(w, struct nvmet_rdma_queue, release_work); struct nvmet_rdma_device *dev = queue->dev; nvmet_rdma_free_queue(queue); kref_put(&dev->ref, nvmet_rdma_free_dev); } static int nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn, struct nvmet_rdma_queue *queue) { struct nvme_rdma_cm_req *req; req = (struct nvme_rdma_cm_req *)conn->private_data; if (!req || conn->private_data_len == 0) return NVME_RDMA_CM_INVALID_LEN; if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0) return NVME_RDMA_CM_INVALID_RECFMT; queue->host_qid = le16_to_cpu(req->qid); /* * req->hsqsize corresponds to our recv queue size plus 1 * req->hrqsize corresponds to our send queue size */ queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1; queue->send_queue_size = le16_to_cpu(req->hrqsize); if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH) return NVME_RDMA_CM_INVALID_HSQSIZE; /* XXX: Should we enforce some kind of max for IO queues? */ return 0; } static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id, enum nvme_rdma_cm_status status) { struct nvme_rdma_cm_rej rej; pr_debug("rejecting connect request: status %d (%s)\n", status, nvme_rdma_cm_msg(status)); rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); rej.sts = cpu_to_le16(status); return rdma_reject(cm_id, (void *)&rej, sizeof(rej), IB_CM_REJ_CONSUMER_DEFINED); } static struct nvmet_rdma_queue * nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { struct nvmet_rdma_port *port = cm_id->context; struct nvmet_rdma_queue *queue; int ret; queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) { ret = NVME_RDMA_CM_NO_RSC; goto out_reject; } ret = nvmet_sq_init(&queue->nvme_sq); if (ret) { ret = NVME_RDMA_CM_NO_RSC; goto out_free_queue; } ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue); if (ret) goto out_destroy_sq; /* * Schedules the actual release because calling rdma_destroy_id from * inside a CM callback would trigger a deadlock. (great API design..) */ INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work); queue->dev = ndev; queue->cm_id = cm_id; queue->port = port->nport; spin_lock_init(&queue->state_lock); queue->state = NVMET_RDMA_Q_CONNECTING; INIT_LIST_HEAD(&queue->rsp_wait_list); INIT_LIST_HEAD(&queue->rsp_wr_wait_list); spin_lock_init(&queue->rsp_wr_wait_lock); INIT_LIST_HEAD(&queue->queue_list); queue->idx = ida_alloc(&nvmet_rdma_queue_ida, GFP_KERNEL); if (queue->idx < 0) { ret = NVME_RDMA_CM_NO_RSC; goto out_destroy_sq; } /* * Spread the io queues across completion vectors, * but still keep all admin queues on vector 0. */ queue->comp_vector = !queue->host_qid ? 0 : queue->idx % ndev->device->num_comp_vectors; ret = nvmet_rdma_alloc_rsps(queue); if (ret) { ret = NVME_RDMA_CM_NO_RSC; goto out_ida_remove; } if (ndev->srqs) { queue->nsrq = ndev->srqs[queue->comp_vector % ndev->srq_count]; } else { queue->cmds = nvmet_rdma_alloc_cmds(ndev, queue->recv_queue_size, !queue->host_qid); if (IS_ERR(queue->cmds)) { ret = NVME_RDMA_CM_NO_RSC; goto out_free_responses; } } ret = nvmet_rdma_create_queue_ib(queue); if (ret) { pr_err("%s: creating RDMA queue failed (%d).\n", __func__, ret); ret = NVME_RDMA_CM_NO_RSC; goto out_free_cmds; } return queue; out_free_cmds: if (!queue->nsrq) { nvmet_rdma_free_cmds(queue->dev, queue->cmds, queue->recv_queue_size, !queue->host_qid); } out_free_responses: nvmet_rdma_free_rsps(queue); out_ida_remove: ida_free(&nvmet_rdma_queue_ida, queue->idx); out_destroy_sq: nvmet_sq_destroy(&queue->nvme_sq); out_free_queue: kfree(queue); out_reject: nvmet_rdma_cm_reject(cm_id, ret); return NULL; } static void nvmet_rdma_qp_event(struct ib_event *event, void *priv) { struct nvmet_rdma_queue *queue = priv; switch (event->event) { case IB_EVENT_COMM_EST: rdma_notify(queue->cm_id, event->event); break; case IB_EVENT_QP_LAST_WQE_REACHED: pr_debug("received last WQE reached event for queue=0x%p\n", queue); break; default: pr_err("received IB QP event: %s (%d)\n", ib_event_msg(event->event), event->event); break; } } static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id, struct nvmet_rdma_queue *queue, struct rdma_conn_param *p) { struct rdma_conn_param param = { }; struct nvme_rdma_cm_rep priv = { }; int ret = -ENOMEM; param.rnr_retry_count = 7; param.flow_control = 1; param.initiator_depth = min_t(u8, p->initiator_depth, queue->dev->device->attrs.max_qp_init_rd_atom); param.private_data = &priv; param.private_data_len = sizeof(priv); priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); priv.crqsize = cpu_to_le16(queue->recv_queue_size); ret = rdma_accept(cm_id, &param); if (ret) pr_err("rdma_accept failed (error code = %d)\n", ret); return ret; } static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { struct nvmet_rdma_device *ndev; struct nvmet_rdma_queue *queue; int ret = -EINVAL; ndev = nvmet_rdma_find_get_device(cm_id); if (!ndev) { nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC); return -ECONNREFUSED; } queue = nvmet_rdma_alloc_queue(ndev, cm_id, event); if (!queue) { ret = -ENOMEM; goto put_device; } if (queue->host_qid == 0) { struct nvmet_rdma_queue *q; int pending = 0; /* Check for pending controller teardown */ mutex_lock(&nvmet_rdma_queue_mutex); list_for_each_entry(q, &nvmet_rdma_queue_list, queue_list) { if (q->nvme_sq.ctrl == queue->nvme_sq.ctrl && q->state == NVMET_RDMA_Q_DISCONNECTING) pending++; } mutex_unlock(&nvmet_rdma_queue_mutex); if (pending > NVMET_RDMA_BACKLOG) return NVME_SC_CONNECT_CTRL_BUSY; } ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); if (ret) { /* * Don't destroy the cm_id in free path, as we implicitly * destroy the cm_id here with non-zero ret code. */ queue->cm_id = NULL; goto free_queue; } mutex_lock(&nvmet_rdma_queue_mutex); list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list); mutex_unlock(&nvmet_rdma_queue_mutex); return 0; free_queue: nvmet_rdma_free_queue(queue); put_device: kref_put(&ndev->ref, nvmet_rdma_free_dev); return ret; } static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue) { unsigned long flags; spin_lock_irqsave(&queue->state_lock, flags); if (queue->state != NVMET_RDMA_Q_CONNECTING) { pr_warn("trying to establish a connected queue\n"); goto out_unlock; } queue->state = NVMET_RDMA_Q_LIVE; while (!list_empty(&queue->rsp_wait_list)) { struct nvmet_rdma_rsp *cmd; cmd = list_first_entry(&queue->rsp_wait_list, struct nvmet_rdma_rsp, wait_list); list_del(&cmd->wait_list); spin_unlock_irqrestore(&queue->state_lock, flags); nvmet_rdma_handle_command(queue, cmd); spin_lock_irqsave(&queue->state_lock, flags); } out_unlock: spin_unlock_irqrestore(&queue->state_lock, flags); } static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) { bool disconnect = false; unsigned long flags; pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state); spin_lock_irqsave(&queue->state_lock, flags); switch (queue->state) { case NVMET_RDMA_Q_CONNECTING: while (!list_empty(&queue->rsp_wait_list)) { struct nvmet_rdma_rsp *rsp; rsp = list_first_entry(&queue->rsp_wait_list, struct nvmet_rdma_rsp, wait_list); list_del(&rsp->wait_list); nvmet_rdma_put_rsp(rsp); } fallthrough; case NVMET_RDMA_Q_LIVE: queue->state = NVMET_RDMA_Q_DISCONNECTING; disconnect = true; break; case NVMET_RDMA_Q_DISCONNECTING: break; } spin_unlock_irqrestore(&queue->state_lock, flags); if (disconnect) { rdma_disconnect(queue->cm_id); queue_work(nvmet_wq, &queue->release_work); } } static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) { bool disconnect = false; mutex_lock(&nvmet_rdma_queue_mutex); if (!list_empty(&queue->queue_list)) { list_del_init(&queue->queue_list); disconnect = true; } mutex_unlock(&nvmet_rdma_queue_mutex); if (disconnect) __nvmet_rdma_queue_disconnect(queue); } static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, struct nvmet_rdma_queue *queue) { WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING); mutex_lock(&nvmet_rdma_queue_mutex); if (!list_empty(&queue->queue_list)) list_del_init(&queue->queue_list); mutex_unlock(&nvmet_rdma_queue_mutex); pr_err("failed to connect queue %d\n", queue->idx); queue_work(nvmet_wq, &queue->release_work); } /** * nvmet_rdma_device_removal() - Handle RDMA device removal * @cm_id: rdma_cm id, used for nvmet port * @queue: nvmet rdma queue (cm id qp_context) * * DEVICE_REMOVAL event notifies us that the RDMA device is about * to unplug. Note that this event can be generated on a normal * queue cm_id and/or a device bound listener cm_id (where in this * case queue will be null). * * We registered an ib_client to handle device removal for queues, * so we only need to handle the listening port cm_ids. In this case * we nullify the priv to prevent double cm_id destruction and destroying * the cm_id implicitely by returning a non-zero rc to the callout. */ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, struct nvmet_rdma_queue *queue) { struct nvmet_rdma_port *port; if (queue) { /* * This is a queue cm_id. we have registered * an ib_client to handle queues removal * so don't interfear and just return. */ return 0; } port = cm_id->context; /* * This is a listener cm_id. Make sure that * future remove_port won't invoke a double * cm_id destroy. use atomic xchg to make sure * we don't compete with remove_port. */ if (xchg(&port->cm_id, NULL) != cm_id) return 0; /* * We need to return 1 so that the core will destroy * it's own ID. What a great API design.. */ return 1; } static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { struct nvmet_rdma_queue *queue = NULL; int ret = 0; if (cm_id->qp) queue = cm_id->qp->qp_context; pr_debug("%s (%d): status %d id %p\n", rdma_event_msg(event->event), event->event, event->status, cm_id); switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: ret = nvmet_rdma_queue_connect(cm_id, event); break; case RDMA_CM_EVENT_ESTABLISHED: nvmet_rdma_queue_established(queue); break; case RDMA_CM_EVENT_ADDR_CHANGE: if (!queue) { struct nvmet_rdma_port *port = cm_id->context; queue_delayed_work(nvmet_wq, &port->repair_work, 0); break; } fallthrough; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_TIMEWAIT_EXIT: nvmet_rdma_queue_disconnect(queue); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: ret = nvmet_rdma_device_removal(cm_id, queue); break; case RDMA_CM_EVENT_REJECTED: pr_debug("Connection rejected: %s\n", rdma_reject_msg(cm_id, event->status)); fallthrough; case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_CONNECT_ERROR: nvmet_rdma_queue_connect_fail(cm_id, queue); break; default: pr_err("received unrecognized RDMA CM event %d\n", event->event); break; } return ret; } static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl) { struct nvmet_rdma_queue *queue, *n; mutex_lock(&nvmet_rdma_queue_mutex); list_for_each_entry_safe(queue, n, &nvmet_rdma_queue_list, queue_list) { if (queue->nvme_sq.ctrl != ctrl) continue; list_del_init(&queue->queue_list); __nvmet_rdma_queue_disconnect(queue); } mutex_unlock(&nvmet_rdma_queue_mutex); } static void nvmet_rdma_destroy_port_queues(struct nvmet_rdma_port *port) { struct nvmet_rdma_queue *queue, *tmp; struct nvmet_port *nport = port->nport; mutex_lock(&nvmet_rdma_queue_mutex); list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, queue_list) { if (queue->port != nport) continue; list_del_init(&queue->queue_list); __nvmet_rdma_queue_disconnect(queue); } mutex_unlock(&nvmet_rdma_queue_mutex); } static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port) { struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL); if (cm_id) rdma_destroy_id(cm_id); /* * Destroy the remaining queues, which are not belong to any * controller yet. Do it here after the RDMA-CM was destroyed * guarantees that no new queue will be created. */ nvmet_rdma_destroy_port_queues(port); } static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port) { struct sockaddr *addr = (struct sockaddr *)&port->addr; struct rdma_cm_id *cm_id; int ret; cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) { pr_err("CM ID creation failed\n"); return PTR_ERR(cm_id); } /* * Allow both IPv4 and IPv6 sockets to bind a single port * at the same time. */ ret = rdma_set_afonly(cm_id, 1); if (ret) { pr_err("rdma_set_afonly failed (%d)\n", ret); goto out_destroy_id; } ret = rdma_bind_addr(cm_id, addr); if (ret) { pr_err("binding CM ID to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; } ret = rdma_listen(cm_id, NVMET_RDMA_BACKLOG); if (ret) { pr_err("listening to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; } port->cm_id = cm_id; return 0; out_destroy_id: rdma_destroy_id(cm_id); return ret; } static void nvmet_rdma_repair_port_work(struct work_struct *w) { struct nvmet_rdma_port *port = container_of(to_delayed_work(w), struct nvmet_rdma_port, repair_work); int ret; nvmet_rdma_disable_port(port); ret = nvmet_rdma_enable_port(port); if (ret) queue_delayed_work(nvmet_wq, &port->repair_work, 5 * HZ); } static int nvmet_rdma_add_port(struct nvmet_port *nport) { struct nvmet_rdma_port *port; __kernel_sa_family_t af; int ret; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; nport->priv = port; port->nport = nport; INIT_DELAYED_WORK(&port->repair_work, nvmet_rdma_repair_port_work); switch (nport->disc_addr.adrfam) { case NVMF_ADDR_FAMILY_IP4: af = AF_INET; break; case NVMF_ADDR_FAMILY_IP6: af = AF_INET6; break; default: pr_err("address family %d not supported\n", nport->disc_addr.adrfam); ret = -EINVAL; goto out_free_port; } if (nport->inline_data_size < 0) { nport->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; } else if (nport->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { pr_warn("inline_data_size %u is too large, reducing to %u\n", nport->inline_data_size, NVMET_RDMA_MAX_INLINE_DATA_SIZE); nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; } if (nport->max_queue_size < 0) { nport->max_queue_size = NVME_RDMA_DEFAULT_QUEUE_SIZE; } else if (nport->max_queue_size > NVME_RDMA_MAX_QUEUE_SIZE) { pr_warn("max_queue_size %u is too large, reducing to %u\n", nport->max_queue_size, NVME_RDMA_MAX_QUEUE_SIZE); nport->max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE; } ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, nport->disc_addr.trsvcid, &port->addr); if (ret) { pr_err("malformed ip/port passed: %s:%s\n", nport->disc_addr.traddr, nport->disc_addr.trsvcid); goto out_free_port; } ret = nvmet_rdma_enable_port(port); if (ret) goto out_free_port; pr_info("enabling port %d (%pISpcs)\n", le16_to_cpu(nport->disc_addr.portid), (struct sockaddr *)&port->addr); return 0; out_free_port: kfree(port); return ret; } static void nvmet_rdma_remove_port(struct nvmet_port *nport) { struct nvmet_rdma_port *port = nport->priv; cancel_delayed_work_sync(&port->repair_work); nvmet_rdma_disable_port(port); kfree(port); } static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, struct nvmet_port *nport, char *traddr) { struct nvmet_rdma_port *port = nport->priv; struct rdma_cm_id *cm_id = port->cm_id; if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) { struct nvmet_rdma_rsp *rsp = container_of(req, struct nvmet_rdma_rsp, req); struct rdma_cm_id *req_cm_id = rsp->queue->cm_id; struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr; sprintf(traddr, "%pISc", addr); } else { memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); } } static ssize_t nvmet_rdma_host_port_addr(struct nvmet_ctrl *ctrl, char *traddr, size_t traddr_len) { struct nvmet_sq *nvme_sq = ctrl->sqs[0]; struct nvmet_rdma_queue *queue = container_of(nvme_sq, struct nvmet_rdma_queue, nvme_sq); return snprintf(traddr, traddr_len, "%pISc", (struct sockaddr *)&queue->cm_id->route.addr.dst_addr); } static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl) { if (ctrl->pi_support) return NVMET_RDMA_MAX_METADATA_MDTS; return NVMET_RDMA_MAX_MDTS; } static u16 nvmet_rdma_get_max_queue_size(const struct nvmet_ctrl *ctrl) { if (ctrl->pi_support) return NVME_RDMA_MAX_METADATA_QUEUE_SIZE; return NVME_RDMA_MAX_QUEUE_SIZE; } static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_RDMA, .msdbd = 1, .flags = NVMF_KEYED_SGLS | NVMF_METADATA_SUPPORTED, .add_port = nvmet_rdma_add_port, .remove_port = nvmet_rdma_remove_port, .queue_response = nvmet_rdma_queue_response, .delete_ctrl = nvmet_rdma_delete_ctrl, .disc_traddr = nvmet_rdma_disc_port_addr, .host_traddr = nvmet_rdma_host_port_addr, .get_mdts = nvmet_rdma_get_mdts, .get_max_queue_size = nvmet_rdma_get_max_queue_size, }; static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) { struct nvmet_rdma_queue *queue, *tmp; struct nvmet_rdma_device *ndev; bool found = false; mutex_lock(&device_list_mutex); list_for_each_entry(ndev, &device_list, entry) { if (ndev->device == ib_device) { found = true; break; } } mutex_unlock(&device_list_mutex); if (!found) return; /* * IB Device that is used by nvmet controllers is being removed, * delete all queues using this device. */ mutex_lock(&nvmet_rdma_queue_mutex); list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, queue_list) { if (queue->dev->device != ib_device) continue; pr_info("Removing queue %d\n", queue->idx); list_del_init(&queue->queue_list); __nvmet_rdma_queue_disconnect(queue); } mutex_unlock(&nvmet_rdma_queue_mutex); flush_workqueue(nvmet_wq); } static struct ib_client nvmet_rdma_ib_client = { .name = "nvmet_rdma", .remove = nvmet_rdma_remove_one }; static int __init nvmet_rdma_init(void) { int ret; ret = ib_register_client(&nvmet_rdma_ib_client); if (ret) return ret; ret = nvmet_register_transport(&nvmet_rdma_ops); if (ret) goto err_ib_client; return 0; err_ib_client: ib_unregister_client(&nvmet_rdma_ib_client); return ret; } static void __exit nvmet_rdma_exit(void) { nvmet_unregister_transport(&nvmet_rdma_ops); ib_unregister_client(&nvmet_rdma_ib_client); WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list)); ida_destroy(&nvmet_rdma_queue_ida); } module_init(nvmet_rdma_init); module_exit(nvmet_rdma_exit); MODULE_DESCRIPTION("NVMe target RDMA transport driver"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */
8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Fredy Neeser */ /* Greg Joyce <greg@opengridcomputing.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ /* Copyright (c) 2017, Open Grid Computing, Inc. */ #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <linux/workqueue.h> #include <net/sock.h> #include <net/tcp.h> #include <linux/inet.h> #include <linux/tcp.h> #include <trace/events/sock.h> #include <rdma/iw_cm.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include "siw.h" #include "siw_cm.h" /* * Set to any combination of * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR */ static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR; static const bool relaxed_ird_negotiation = true; static void siw_cm_llp_state_change(struct sock *s); static void siw_cm_llp_data_ready(struct sock *s); static void siw_cm_llp_write_space(struct sock *s); static void siw_cm_llp_error_report(struct sock *s); static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, int status); static void siw_sk_assign_cm_upcalls(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); write_lock_bh(&sk->sk_callback_lock); cep->sk_state_change = sk->sk_state_change; cep->sk_data_ready = sk->sk_data_ready; cep->sk_write_space = sk->sk_write_space; cep->sk_error_report = sk->sk_error_report; sk->sk_state_change = siw_cm_llp_state_change; sk->sk_data_ready = siw_cm_llp_data_ready; sk->sk_write_space = siw_cm_llp_write_space; sk->sk_error_report = siw_cm_llp_error_report; write_unlock_bh(&sk->sk_callback_lock); } static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep) { sk->sk_state_change = cep->sk_state_change; sk->sk_data_ready = cep->sk_data_ready; sk->sk_write_space = cep->sk_write_space; sk->sk_error_report = cep->sk_error_report; sk->sk_user_data = NULL; } static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp) { struct socket *s = cep->sock; struct sock *sk = s->sk; write_lock_bh(&sk->sk_callback_lock); qp->attrs.sk = s; sk->sk_data_ready = siw_qp_llp_data_ready; sk->sk_write_space = siw_qp_llp_write_space; write_unlock_bh(&sk->sk_callback_lock); } static void siw_socket_disassoc(struct socket *s) { struct sock *sk = s->sk; struct siw_cep *cep; if (sk) { write_lock_bh(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (cep) { siw_sk_restore_upcalls(sk, cep); siw_cep_put(cep); } else { pr_warn("siw: cannot restore sk callbacks: no ep\n"); } write_unlock_bh(&sk->sk_callback_lock); } else { pr_warn("siw: cannot restore sk callbacks: no sk\n"); } } static void siw_rtr_data_ready(struct sock *sk) { struct siw_cep *cep; struct siw_qp *qp = NULL; read_descriptor_t rd_desc; trace_sk_data_ready(sk); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) { WARN(1, "No connection endpoint\n"); goto out; } qp = sk_to_qp(sk); memset(&rd_desc, 0, sizeof(rd_desc)); rd_desc.arg.data = qp; rd_desc.count = 1; tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data); /* * Check if first frame was successfully processed. * Signal connection full establishment if yes. * Failed data processing would have already scheduled * connection drop. */ if (!qp->rx_stream.rx_suspend) siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); out: read_unlock(&sk->sk_callback_lock); if (qp) siw_qp_socket_assoc(cep, qp); } static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep) { struct sock *sk = cep->sock->sk; write_lock_bh(&sk->sk_callback_lock); sk->sk_data_ready = siw_rtr_data_ready; sk->sk_write_space = siw_qp_llp_write_space; write_unlock_bh(&sk->sk_callback_lock); } static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s) { cep->sock = s; siw_cep_get(cep); s->sk->sk_user_data = cep; siw_sk_assign_cm_upcalls(s->sk); } static struct siw_cep *siw_cep_alloc(struct siw_device *sdev) { struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL); unsigned long flags; if (!cep) return NULL; INIT_LIST_HEAD(&cep->listenq); INIT_LIST_HEAD(&cep->devq); INIT_LIST_HEAD(&cep->work_freelist); kref_init(&cep->ref); cep->state = SIW_EPSTATE_IDLE; init_waitqueue_head(&cep->waitq); spin_lock_init(&cep->lock); cep->sdev = sdev; cep->enhanced_rdma_conn_est = false; spin_lock_irqsave(&sdev->lock, flags); list_add_tail(&cep->devq, &sdev->cep_list); spin_unlock_irqrestore(&sdev->lock, flags); siw_dbg_cep(cep, "new endpoint\n"); return cep; } static void siw_cm_free_work(struct siw_cep *cep) { struct list_head *w, *tmp; struct siw_cm_work *work; list_for_each_safe(w, tmp, &cep->work_freelist) { work = list_entry(w, struct siw_cm_work, list); list_del(&work->list); kfree(work); } } static void siw_cancel_mpatimer(struct siw_cep *cep) { spin_lock_bh(&cep->lock); if (cep->mpa_timer) { if (cancel_delayed_work(&cep->mpa_timer->work)) { siw_cep_put(cep); kfree(cep->mpa_timer); /* not needed again */ } cep->mpa_timer = NULL; } spin_unlock_bh(&cep->lock); } static void siw_put_work(struct siw_cm_work *work) { INIT_LIST_HEAD(&work->list); spin_lock_bh(&work->cep->lock); list_add(&work->list, &work->cep->work_freelist); spin_unlock_bh(&work->cep->lock); } static void siw_cep_set_inuse(struct siw_cep *cep) { unsigned long flags; retry: spin_lock_irqsave(&cep->lock, flags); if (cep->in_use) { spin_unlock_irqrestore(&cep->lock, flags); wait_event_interruptible(cep->waitq, !cep->in_use); if (signal_pending(current)) flush_signals(current); goto retry; } else { cep->in_use = 1; spin_unlock_irqrestore(&cep->lock, flags); } } static void siw_cep_set_free(struct siw_cep *cep) { unsigned long flags; spin_lock_irqsave(&cep->lock, flags); cep->in_use = 0; spin_unlock_irqrestore(&cep->lock, flags); wake_up(&cep->waitq); } static void __siw_cep_dealloc(struct kref *ref) { struct siw_cep *cep = container_of(ref, struct siw_cep, ref); struct siw_device *sdev = cep->sdev; unsigned long flags; WARN_ON(cep->listen_cep); /* kfree(NULL) is safe */ kfree(cep->mpa.pdata); spin_lock_bh(&cep->lock); if (!list_empty(&cep->work_freelist)) siw_cm_free_work(cep); spin_unlock_bh(&cep->lock); spin_lock_irqsave(&sdev->lock, flags); list_del(&cep->devq); spin_unlock_irqrestore(&sdev->lock, flags); siw_dbg_cep(cep, "free endpoint\n"); kfree(cep); } static struct siw_cm_work *siw_get_work(struct siw_cep *cep) { struct siw_cm_work *work = NULL; spin_lock_bh(&cep->lock); if (!list_empty(&cep->work_freelist)) { work = list_entry(cep->work_freelist.next, struct siw_cm_work, list); list_del_init(&work->list); } spin_unlock_bh(&cep->lock); return work; } static int siw_cm_alloc_work(struct siw_cep *cep, int num) { struct siw_cm_work *work; while (num--) { work = kmalloc(sizeof(*work), GFP_KERNEL); if (!work) { if (!(list_empty(&cep->work_freelist))) siw_cm_free_work(cep); return -ENOMEM; } work->cep = cep; INIT_LIST_HEAD(&work->list); list_add(&work->list, &cep->work_freelist); } return 0; } /* * siw_cm_upcall() * * Upcall to IWCM to inform about async connection events */ static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, int status) { struct iw_cm_event event; struct iw_cm_id *id; memset(&event, 0, sizeof(event)); event.status = status; event.event = reason; if (reason == IW_CM_EVENT_CONNECT_REQUEST) { event.provider_data = cep; id = cep->listen_cep->cm_id; } else { id = cep->cm_id; } /* Signal IRD and ORD */ if (reason == IW_CM_EVENT_ESTABLISHED || reason == IW_CM_EVENT_CONNECT_REPLY) { /* Signal negotiated IRD/ORD values we will use */ event.ird = cep->ird; event.ord = cep->ord; } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) { event.ird = cep->ord; event.ord = cep->ird; } /* Signal private data and address information */ if (reason == IW_CM_EVENT_CONNECT_REQUEST || reason == IW_CM_EVENT_CONNECT_REPLY) { u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len); if (pd_len) { /* * hand over MPA private data */ event.private_data_len = pd_len; event.private_data = cep->mpa.pdata; /* Hide MPA V2 IRD/ORD control */ if (cep->enhanced_rdma_conn_est) { event.private_data_len -= sizeof(struct mpa_v2_data); event.private_data += sizeof(struct mpa_v2_data); } } getname_local(cep->sock, &event.local_addr); getname_peer(cep->sock, &event.remote_addr); } siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n", cep->qp ? qp_id(cep->qp) : UINT_MAX, reason, status); return id->event_handler(id, &event); } static void siw_free_cm_id(struct siw_cep *cep) { if (!cep->cm_id) return; cep->cm_id->rem_ref(cep->cm_id); cep->cm_id = NULL; } static void siw_destroy_cep_sock(struct siw_cep *cep) { if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } } /* * siw_qp_cm_drop() * * Drops established LLP connection if present and not already * scheduled for dropping. Called from user context, SQ workqueue * or receive IRQ. Caller signals if socket can be immediately * closed (basically, if not in IRQ). */ void siw_qp_cm_drop(struct siw_qp *qp, int schedule) { struct siw_cep *cep = qp->cep; qp->rx_stream.rx_suspend = 1; qp->tx_ctx.tx_suspend = 1; if (!qp->cep) return; if (schedule) { siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP); } else { siw_cep_set_inuse(cep); if (cep->state == SIW_EPSTATE_CLOSED) { siw_dbg_cep(cep, "already closed\n"); goto out; } siw_dbg_cep(cep, "immediate close, state %d\n", cep->state); siw_send_terminate(qp); if (cep->cm_id) { switch (cep->state) { case SIW_EPSTATE_AWAIT_MPAREP: siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); break; case SIW_EPSTATE_RDMA_MODE: siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); break; case SIW_EPSTATE_IDLE: case SIW_EPSTATE_LISTENING: case SIW_EPSTATE_CONNECTING: case SIW_EPSTATE_AWAIT_MPAREQ: case SIW_EPSTATE_RECVD_MPAREQ: case SIW_EPSTATE_CLOSED: default: break; } siw_free_cm_id(cep); siw_cep_put(cep); } cep->state = SIW_EPSTATE_CLOSED; siw_destroy_cep_sock(cep); if (cep->qp) { cep->qp = NULL; siw_qp_put(qp); } out: siw_cep_set_free(cep); } } void siw_cep_put(struct siw_cep *cep) { WARN_ON(kref_read(&cep->ref) < 1); kref_put(&cep->ref, __siw_cep_dealloc); } static void siw_cep_set_free_and_put(struct siw_cep *cep) { siw_cep_set_free(cep); siw_cep_put(cep); } void siw_cep_get(struct siw_cep *cep) { kref_get(&cep->ref); } /* * Expects params->pd_len in host byte order */ static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len) { struct socket *s = cep->sock; struct mpa_rr *rr = &cep->mpa.hdr; struct kvec iov[3]; struct msghdr msg; int rv; int iovec_num = 0; int mpa_len; memset(&msg, 0, sizeof(msg)); iov[iovec_num].iov_base = rr; iov[iovec_num].iov_len = sizeof(*rr); mpa_len = sizeof(*rr); if (cep->enhanced_rdma_conn_est) { iovec_num++; iov[iovec_num].iov_base = &cep->mpa.v2_ctrl; iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl); mpa_len += sizeof(cep->mpa.v2_ctrl); } if (pd_len) { iovec_num++; iov[iovec_num].iov_base = (char *)pdata; iov[iovec_num].iov_len = pd_len; mpa_len += pd_len; } if (cep->enhanced_rdma_conn_est) pd_len += sizeof(cep->mpa.v2_ctrl); rr->params.pd_len = cpu_to_be16(pd_len); rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len); return rv < 0 ? rv : 0; } /* * Receive MPA Request/Reply header. * * Returns 0 if complete MPA Request/Reply header including * eventual private data was received. Returns -EAGAIN if * header was partially received or negative error code otherwise. * * Context: May be called in process context only */ static int siw_recv_mpa_rr(struct siw_cep *cep) { struct mpa_rr *hdr = &cep->mpa.hdr; struct socket *s = cep->sock; u16 pd_len; int rcvd, to_rcv; if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) { rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd, sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd, 0); if (rcvd <= 0) return -ECONNABORTED; cep->mpa.bytes_rcvd += rcvd; if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) return -EAGAIN; if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA) return -EPROTO; } pd_len = be16_to_cpu(hdr->params.pd_len); /* * At least the MPA Request/Reply header (frame not including * private data) has been received. * Receive (or continue receiving) any private data. */ to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr)); if (!to_rcv) { /* * We must have hdr->params.pd_len == 0 and thus received a * complete MPA Request/Reply frame. * Check against peer protocol violation. */ u32 word; rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT); if (rcvd == -EAGAIN) return 0; if (rcvd == 0) { siw_dbg_cep(cep, "peer EOF\n"); return -EPIPE; } if (rcvd < 0) { siw_dbg_cep(cep, "error: %d\n", rcvd); return rcvd; } siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd); return -EPROTO; } /* * At this point, we must have hdr->params.pd_len != 0. * A private data buffer gets allocated if hdr->params.pd_len != 0. */ if (!cep->mpa.pdata) { cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); if (!cep->mpa.pdata) return -ENOMEM; } rcvd = ksock_recv( s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr), to_rcv + 4, MSG_DONTWAIT); if (rcvd < 0) return rcvd; if (rcvd > to_rcv) return -EPROTO; cep->mpa.bytes_rcvd += rcvd; if (to_rcv == rcvd) { siw_dbg_cep(cep, "%d bytes private data received\n", pd_len); return 0; } return -EAGAIN; } /* * siw_proc_mpareq() * * Read MPA Request from socket and signal new connection to IWCM * if success. Caller must hold lock on corresponding listening CEP. */ static int siw_proc_mpareq(struct siw_cep *cep) { struct mpa_rr *req; int version, rv; u16 pd_len; rv = siw_recv_mpa_rr(cep); if (rv) return rv; req = &cep->mpa.hdr; version = __mpa_rr_revision(req->params.bits); pd_len = be16_to_cpu(req->params.pd_len); if (version > MPA_REVISION_2) /* allow for 0, 1, and 2 only */ return -EPROTO; if (memcmp(req->key, MPA_KEY_REQ, 16)) return -EPROTO; /* Prepare for sending MPA reply */ memcpy(req->key, MPA_KEY_REP, 16); if (version == MPA_REVISION_2 && (req->params.bits & MPA_RR_FLAG_ENHANCED)) { /* * MPA version 2 must signal IRD/ORD values and P2P mode * in private data if header flag MPA_RR_FLAG_ENHANCED * is set. */ if (pd_len < sizeof(struct mpa_v2_data)) goto reject_conn; cep->enhanced_rdma_conn_est = true; } /* MPA Markers: currently not supported. Marker TX to be added. */ if (req->params.bits & MPA_RR_FLAG_MARKERS) goto reject_conn; if (req->params.bits & MPA_RR_FLAG_CRC) { /* * RFC 5044, page 27: CRC MUST be used if peer requests it. * siw specific: 'mpa_crc_strict' parameter to reject * connection with CRC if local CRC off enforced by * 'mpa_crc_strict' module parameter. */ if (!mpa_crc_required && mpa_crc_strict) goto reject_conn; /* Enable CRC if requested by module parameter */ if (mpa_crc_required) req->params.bits |= MPA_RR_FLAG_CRC; } if (cep->enhanced_rdma_conn_est) { struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata; /* * Peer requested ORD becomes requested local IRD, * peer requested IRD becomes requested local ORD. * IRD and ORD get limited by global maximum values. */ cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK; cep->ord = min(cep->ord, SIW_MAX_ORD_QP); cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK; cep->ird = min(cep->ird, SIW_MAX_IRD_QP); /* May get overwritten by locally negotiated values */ cep->mpa.v2_ctrl.ird = htons(cep->ird); cep->mpa.v2_ctrl.ord = htons(cep->ord); /* * Support for peer sent zero length Write or Read to * let local side enter RTS. Writes are preferred. * Sends would require pre-posting a Receive and are * not supported. * Propose zero length Write if none of Read and Write * is indicated. */ if (v2->ird & MPA_V2_PEER_TO_PEER) { cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; if (v2->ord & MPA_V2_RDMA_WRITE_RTR) cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; else if (v2->ord & MPA_V2_RDMA_READ_RTR) cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR; else cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; } } cep->state = SIW_EPSTATE_RECVD_MPAREQ; /* Keep reference until IWCM accepts/rejects */ siw_cep_get(cep); rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0); if (rv) siw_cep_put(cep); return rv; reject_conn: siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n", req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, mpa_crc_required, mpa_crc_strict, req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); req->params.bits &= ~MPA_RR_FLAG_MARKERS; req->params.bits |= MPA_RR_FLAG_REJECT; if (!mpa_crc_required && mpa_crc_strict) req->params.bits &= ~MPA_RR_FLAG_CRC; if (pd_len) kfree(cep->mpa.pdata); cep->mpa.pdata = NULL; siw_send_mpareqrep(cep, NULL, 0); return -EOPNOTSUPP; } static int siw_proc_mpareply(struct siw_cep *cep) { struct siw_qp_attrs qp_attrs; enum siw_qp_attr_mask qp_attr_mask; struct siw_qp *qp = cep->qp; struct mpa_rr *rep; int rv; u16 rep_ord; u16 rep_ird; bool ird_insufficient = false; enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR; rv = siw_recv_mpa_rr(cep); if (rv) goto out_err; siw_cancel_mpatimer(cep); rep = &cep->mpa.hdr; if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) { /* allow for 0, 1, and 2 only */ rv = -EPROTO; goto out_err; } if (memcmp(rep->key, MPA_KEY_REP, 16)) { siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_INVALID_REQ_RESP, 0); siw_send_terminate(qp); rv = -EPROTO; goto out_err; } if (rep->params.bits & MPA_RR_FLAG_REJECT) { siw_dbg_cep(cep, "got mpa reject\n"); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); return -ECONNRESET; } if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) { siw_dbg_cep(cep, "peer allows GSO on TX\n"); qp->tx_ctx.gso_seg_limit = 0; } if ((rep->params.bits & MPA_RR_FLAG_MARKERS) || (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) || (mpa_crc_strict && !mpa_crc_required && (rep->params.bits & MPA_RR_FLAG_CRC))) { siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n", rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, mpa_crc_required, mpa_crc_strict, rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED); return -EINVAL; } if (cep->enhanced_rdma_conn_est) { struct mpa_v2_data *v2; if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 || !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) { /* * Protocol failure: The responder MUST reply with * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED. */ siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n", __mpa_rr_revision(rep->params.bits), rep->params.bits & MPA_RR_FLAG_ENHANCED ? 1 : 0); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); return -EINVAL; } v2 = (struct mpa_v2_data *)cep->mpa.pdata; rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK; rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK; if (cep->ird < rep_ord && (relaxed_ird_negotiation == false || rep_ord > cep->sdev->attrs.max_ird)) { siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n", cep->ird, rep_ord, cep->sdev->attrs.max_ord); ird_insufficient = true; } if (cep->ord > rep_ird && relaxed_ird_negotiation == false) { siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord, rep_ird); ird_insufficient = true; } /* * Always report negotiated peer values to user, * even if IRD/ORD negotiation failed */ cep->ird = rep_ord; cep->ord = rep_ird; if (ird_insufficient) { /* * If the initiator IRD is insuffient for the * responder ORD, send a TERM. */ siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_INSUFFICIENT_IRD, 0); siw_send_terminate(qp); rv = -ENOMEM; goto out_err; } if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER) mpa_p2p_mode = cep->mpa.v2_ctrl_req.ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); /* * Check if we requested P2P mode, and if peer agrees */ if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { if ((mpa_p2p_mode & v2->ord) == 0) { /* * We requested RTR mode(s), but the peer * did not pick any mode we support. */ siw_dbg_cep(cep, "rtr mode: req %2x, got %2x\n", mpa_p2p_mode, v2->ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)); siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_NO_MATCHING_RTR, 0); siw_send_terminate(qp); rv = -EPROTO; goto out_err; } mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); } } memset(&qp_attrs, 0, sizeof(qp_attrs)); if (rep->params.bits & MPA_RR_FLAG_CRC) qp_attrs.flags = SIW_MPA_CRC; qp_attrs.irq_size = cep->ird; qp_attrs.orq_size = cep->ord; qp_attrs.sk = cep->sock; qp_attrs.state = SIW_QP_STATE_RTS; qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA; /* Move socket RX/TX under QP control */ down_write(&qp->state_lock); if (qp->attrs.state > SIW_QP_STATE_RTR) { rv = -EINVAL; up_write(&qp->state_lock); goto out_err; } rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask); siw_qp_socket_assoc(cep, qp); up_write(&qp->state_lock); /* Send extra RDMA frame to trigger peer RTS if negotiated */ if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { rv = siw_qp_mpa_rts(qp, mpa_p2p_mode); if (rv) goto out_err; } if (!rv) { rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0); if (!rv) cep->state = SIW_EPSTATE_RDMA_MODE; return 0; } out_err: if (rv != -EAGAIN) siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); return rv; } /* * siw_accept_newconn - accept an incoming pending connection * */ static void siw_accept_newconn(struct siw_cep *cep) { struct socket *s = cep->sock; struct socket *new_s = NULL; struct siw_cep *new_cep = NULL; int rv = 0; /* debug only. should disappear */ if (cep->state != SIW_EPSTATE_LISTENING) goto error; new_cep = siw_cep_alloc(cep->sdev); if (!new_cep) goto error; /* * 4: Allocate a sufficient number of work elements * to allow concurrent handling of local + peer close * events, MPA header processing + MPA timeout. */ if (siw_cm_alloc_work(new_cep, 4) != 0) goto error; /* * Copy saved socket callbacks from listening CEP * and assign new socket with new CEP */ new_cep->sk_state_change = cep->sk_state_change; new_cep->sk_data_ready = cep->sk_data_ready; new_cep->sk_write_space = cep->sk_write_space; new_cep->sk_error_report = cep->sk_error_report; rv = kernel_accept(s, &new_s, O_NONBLOCK); if (rv != 0) { /* * Connection already aborted by peer..? */ siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv); goto error; } new_cep->sock = new_s; siw_cep_get(new_cep); new_s->sk->sk_user_data = new_cep; if (siw_tcp_nagle == false) tcp_sock_set_nodelay(new_s->sk); new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); if (rv) goto error; /* * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep. */ new_cep->listen_cep = cep; siw_cep_get(cep); if (atomic_read(&new_s->sk->sk_rmem_alloc)) { /* * MPA REQ already queued */ siw_dbg_cep(cep, "immediate mpa request\n"); siw_cep_set_inuse(new_cep); rv = siw_proc_mpareq(new_cep); if (rv != -EAGAIN) { siw_cep_put(cep); new_cep->listen_cep = NULL; if (rv) { siw_cancel_mpatimer(new_cep); siw_cep_set_free(new_cep); goto error; } } siw_cep_set_free(new_cep); } return; error: if (new_cep) siw_cep_put(new_cep); if (new_s) { siw_socket_disassoc(new_s); sock_release(new_s); new_cep->sock = NULL; } siw_dbg_cep(cep, "error %d\n", rv); } static void siw_cm_work_handler(struct work_struct *w) { struct siw_cm_work *work; struct siw_cep *cep; int release_cep = 0, rv = 0; work = container_of(w, struct siw_cm_work, work.work); cep = work->cep; siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n", cep->qp ? qp_id(cep->qp) : UINT_MAX, work->type, cep->state); siw_cep_set_inuse(cep); switch (work->type) { case SIW_CM_WORK_ACCEPT: siw_accept_newconn(cep); break; case SIW_CM_WORK_READ_MPAHDR: if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { if (cep->listen_cep) { siw_cep_set_inuse(cep->listen_cep); if (cep->listen_cep->state == SIW_EPSTATE_LISTENING) rv = siw_proc_mpareq(cep); else rv = -EFAULT; siw_cep_set_free(cep->listen_cep); if (rv != -EAGAIN) { siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; if (rv) siw_cep_put(cep); } } } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { rv = siw_proc_mpareply(cep); } else { /* * CEP already moved out of MPA handshake. * any connection management already done. * silently ignore the mpa packet. */ if (cep->state == SIW_EPSTATE_RDMA_MODE) { cep->sock->sk->sk_data_ready(cep->sock->sk); siw_dbg_cep(cep, "already in RDMA mode"); } else { siw_dbg_cep(cep, "out of state: %d\n", cep->state); } } if (rv && rv != -EAGAIN) release_cep = 1; break; case SIW_CM_WORK_CLOSE_LLP: /* * QP scheduled LLP close */ if (cep->qp) siw_send_terminate(cep->qp); if (cep->cm_id) siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); release_cep = 1; break; case SIW_CM_WORK_PEER_CLOSE: if (cep->cm_id) { if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { /* * MPA reply not received, but connection drop */ siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); } else if (cep->state == SIW_EPSTATE_RDMA_MODE) { /* * NOTE: IW_CM_EVENT_DISCONNECT is given just * to transition IWCM into CLOSING. */ siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0); siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); } /* * for other states there is no connection * known to the IWCM. */ } else { if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) { /* * Wait for the ulp/CM to call accept/reject */ siw_dbg_cep(cep, "mpa req recvd, wait for ULP\n"); } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { /* * Socket close before MPA request received. */ if (cep->listen_cep) { siw_dbg_cep(cep, "no mpareq: drop listener\n"); siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; } } } release_cep = 1; break; case SIW_CM_WORK_MPATIMEOUT: cep->mpa_timer = NULL; if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { /* * MPA request timed out: * Hide any partially received private data and signal * timeout */ cep->mpa.hdr.params.pd_len = 0; if (cep->cm_id) siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ETIMEDOUT); release_cep = 1; } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { /* * No MPA request received after peer TCP stream setup. */ if (cep->listen_cep) { siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; } release_cep = 1; } break; default: WARN(1, "Undefined CM work type: %d\n", work->type); } if (release_cep) { siw_dbg_cep(cep, "release: timer=%s, QP[%u]\n", cep->mpa_timer ? "y" : "n", cep->qp ? qp_id(cep->qp) : UINT_MAX); siw_cancel_mpatimer(cep); cep->state = SIW_EPSTATE_CLOSED; if (cep->qp) { struct siw_qp *qp = cep->qp; /* * Serialize a potential race with application * closing the QP and calling siw_qp_cm_drop() */ siw_qp_get(qp); siw_cep_set_free(cep); siw_qp_llp_close(qp); siw_qp_put(qp); siw_cep_set_inuse(cep); cep->qp = NULL; siw_qp_put(qp); } if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } if (cep->cm_id) { siw_free_cm_id(cep); siw_cep_put(cep); } } siw_cep_set_free(cep); siw_put_work(work); siw_cep_put(cep); } static struct workqueue_struct *siw_cm_wq; int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type) { struct siw_cm_work *work = siw_get_work(cep); unsigned long delay = 0; if (!work) { siw_dbg_cep(cep, "failed with no work available\n"); return -ENOMEM; } work->type = type; work->cep = cep; siw_cep_get(cep); INIT_DELAYED_WORK(&work->work, siw_cm_work_handler); if (type == SIW_CM_WORK_MPATIMEOUT) { cep->mpa_timer = work; if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) delay = MPAREQ_TIMEOUT; else delay = MPAREP_TIMEOUT; } siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n", cep->qp ? qp_id(cep->qp) : -1, type, delay); queue_delayed_work(siw_cm_wq, &work->work, delay); return 0; } static void siw_cm_llp_data_ready(struct sock *sk) { struct siw_cep *cep; trace_sk_data_ready(sk); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) goto out; siw_dbg_cep(cep, "cep state: %d, socket state %d\n", cep->state, sk->sk_state); if (sk->sk_state != TCP_ESTABLISHED) goto out; switch (cep->state) { case SIW_EPSTATE_RDMA_MODE: case SIW_EPSTATE_LISTENING: break; case SIW_EPSTATE_AWAIT_MPAREQ: case SIW_EPSTATE_AWAIT_MPAREP: siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR); break; default: siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state); break; } out: read_unlock(&sk->sk_callback_lock); } static void siw_cm_llp_write_space(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); if (cep) siw_dbg_cep(cep, "state: %d\n", cep->state); } static void siw_cm_llp_error_report(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); if (cep) { siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n", sk->sk_err, sk->sk_state, cep->state); cep->sk_error_report(sk); } } static void siw_cm_llp_state_change(struct sock *sk) { struct siw_cep *cep; void (*orig_state_change)(struct sock *s); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) { /* endpoint already disassociated */ read_unlock(&sk->sk_callback_lock); return; } orig_state_change = cep->sk_state_change; siw_dbg_cep(cep, "state: %d\n", cep->state); switch (sk->sk_state) { case TCP_ESTABLISHED: /* * handle accepting socket as special case where only * new connection is possible */ siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT); break; case TCP_CLOSE: case TCP_CLOSE_WAIT: if (cep->qp) cep->qp->tx_ctx.tx_suspend = 1; siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE); break; default: siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state); } read_unlock(&sk->sk_callback_lock); orig_state_change(sk); } static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, struct sockaddr *raddr, bool afonly) { int rv, flags = 0; size_t size = laddr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); /* * Make address available again asap. */ sock_set_reuseaddr(s->sk); if (afonly) { rv = ip6_sock_set_v6only(s->sk); if (rv) return rv; } rv = s->ops->bind(s, laddr, size); if (rv < 0) return rv; rv = s->ops->connect(s, raddr, size, flags); return rv < 0 ? rv : 0; } int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) { struct siw_device *sdev = to_siw_dev(id->device); struct siw_qp *qp; struct siw_cep *cep = NULL; struct socket *s = NULL; struct sockaddr *laddr = (struct sockaddr *)&id->local_addr, *raddr = (struct sockaddr *)&id->remote_addr; bool p2p_mode = peer_to_peer, v4 = true; u16 pd_len = params->private_data_len; int version = mpa_version, rv; if (pd_len > MPA_MAX_PRIVDATA) return -EINVAL; if (params->ird > sdev->attrs.max_ird || params->ord > sdev->attrs.max_ord) return -ENOMEM; if (laddr->sa_family == AF_INET6) v4 = false; else if (laddr->sa_family != AF_INET) return -EAFNOSUPPORT; /* * Respect any iwarp port mapping: Use mapped remote address * if valid. Local address must not be mapped, since siw * uses kernel TCP stack. */ if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) || to_sockaddr_in6(id->remote_addr).sin6_port != 0) raddr = (struct sockaddr *)&id->m_remote_addr; qp = siw_qp_id2obj(sdev, params->qpn); if (!qp) { WARN(1, "[QP %u] does not exist\n", params->qpn); rv = -EINVAL; goto error; } siw_dbg_qp(qp, "pd_len %d, laddr %pISp, raddr %pISp\n", pd_len, laddr, raddr); rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) goto error; /* * NOTE: For simplification, connect() is called in blocking * mode. Might be reconsidered for async connection setup at * TCP level. */ rv = kernel_bindconnect(s, laddr, raddr, id->afonly); if (rv != 0) { siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv); goto error; } if (siw_tcp_nagle == false) tcp_sock_set_nodelay(s->sk); cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; goto error; } siw_cep_set_inuse(cep); /* Associate QP with CEP */ siw_cep_get(cep); qp->cep = cep; /* siw_qp_get(qp) already done by QP lookup */ cep->qp = qp; id->add_ref(id); cep->cm_id = id; /* * 4: Allocate a sufficient number of work elements * to allow concurrent handling of local + peer close * events, MPA header processing + MPA timeout. */ rv = siw_cm_alloc_work(cep, 4); if (rv != 0) { rv = -ENOMEM; goto error; } cep->ird = params->ird; cep->ord = params->ord; if (p2p_mode && cep->ord == 0) cep->ord = 1; cep->state = SIW_EPSTATE_CONNECTING; /* * Associate CEP with socket */ siw_cep_socket_assoc(cep, s); cep->state = SIW_EPSTATE_AWAIT_MPAREP; /* * Set MPA Request bits: CRC if required, no MPA Markers, * MPA Rev. according to module parameter 'mpa_version', Key 'Request'. */ cep->mpa.hdr.params.bits = 0; if (version > MPA_REVISION_2) { pr_warn("Setting MPA version to %u\n", MPA_REVISION_2); version = MPA_REVISION_2; /* Adjust also module parameter */ mpa_version = MPA_REVISION_2; } __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version); if (try_gso) cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP; if (mpa_crc_required) cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC; /* * If MPA version == 2: * o Include ORD and IRD. * o Indicate peer-to-peer mode, if required by module * parameter 'peer_to_peer'. */ if (version == MPA_REVISION_2) { cep->enhanced_rdma_conn_est = true; cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED; cep->mpa.v2_ctrl.ird = htons(cep->ird); cep->mpa.v2_ctrl.ord = htons(cep->ord); if (p2p_mode) { cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; cep->mpa.v2_ctrl.ord |= rtr_type; } /* Remember own P2P mode requested */ cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird; cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord; } memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16); rv = siw_send_mpareqrep(cep, params->private_data, pd_len); /* * Reset private data. */ cep->mpa.hdr.params.pd_len = 0; if (rv >= 0) { rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT); if (!rv) { siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp)); siw_cep_set_free(cep); return 0; } } error: siw_dbg(id->device, "failed: %d\n", rv); if (cep) { siw_socket_disassoc(s); sock_release(s); cep->sock = NULL; cep->qp = NULL; cep->cm_id = NULL; id->rem_ref(id); qp->cep = NULL; siw_cep_put(cep); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } else if (s) { sock_release(s); } if (qp) siw_qp_put(qp); return rv; } /* * siw_accept - Let SoftiWARP accept an RDMA connection request * * @id: New connection management id to be used for accepted * connection request * @params: Connection parameters provided by ULP for accepting connection * * Transition QP to RTS state, associate new CM id @id with accepted CEP * and get prepared for TCP input by installing socket callbacks. * Then send MPA Reply and generate the "connection established" event. * Socket callbacks must be installed before sending MPA Reply, because * the latter may cause a first RDMA message to arrive from the RDMA Initiator * side very quickly, at which time the socket callbacks must be ready. */ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) { struct siw_device *sdev = to_siw_dev(id->device); struct siw_cep *cep = (struct siw_cep *)id->provider_data; struct siw_qp *qp; struct siw_qp_attrs qp_attrs; int rv = -EINVAL, max_priv_data = MPA_MAX_PRIVDATA; bool wait_for_peer_rts = false; siw_cep_set_inuse(cep); siw_cep_put(cep); /* Free lingering inbound private data */ if (cep->mpa.hdr.params.pd_len) { cep->mpa.hdr.params.pd_len = 0; kfree(cep->mpa.pdata); cep->mpa.pdata = NULL; } siw_cancel_mpatimer(cep); if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { siw_dbg_cep(cep, "out of state\n"); rv = -ECONNRESET; goto free_cep; } qp = siw_qp_id2obj(sdev, params->qpn); if (!qp) { WARN(1, "[QP %d] does not exist\n", params->qpn); goto free_cep; } down_write(&qp->state_lock); if (qp->attrs.state > SIW_QP_STATE_RTR) goto error_unlock; siw_dbg_cep(cep, "[QP %d]\n", params->qpn); if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) { siw_dbg_cep(cep, "peer allows GSO on TX\n"); qp->tx_ctx.gso_seg_limit = 0; } if (params->ord > sdev->attrs.max_ord || params->ird > sdev->attrs.max_ird) { siw_dbg_cep( cep, "[QP %u]: ord %d (max %d), ird %d (max %d)\n", qp_id(qp), params->ord, sdev->attrs.max_ord, params->ird, sdev->attrs.max_ird); goto error_unlock; } if (cep->enhanced_rdma_conn_est) max_priv_data -= sizeof(struct mpa_v2_data); if (params->private_data_len > max_priv_data) { siw_dbg_cep( cep, "[QP %u]: private data length: %d (max %d)\n", qp_id(qp), params->private_data_len, max_priv_data); goto error_unlock; } if (cep->enhanced_rdma_conn_est) { if (params->ord > cep->ord) { if (relaxed_ird_negotiation) { params->ord = cep->ord; } else { cep->ird = params->ird; cep->ord = params->ord; goto error_unlock; } } if (params->ird < cep->ird) { if (relaxed_ird_negotiation && cep->ird <= sdev->attrs.max_ird) params->ird = cep->ird; else { rv = -ENOMEM; goto error_unlock; } } if (cep->mpa.v2_ctrl.ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)) wait_for_peer_rts = true; /* * Signal back negotiated IRD and ORD values */ cep->mpa.v2_ctrl.ord = htons(params->ord & MPA_IRD_ORD_MASK) | (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD); cep->mpa.v2_ctrl.ird = htons(params->ird & MPA_IRD_ORD_MASK) | (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD); } cep->ird = params->ird; cep->ord = params->ord; cep->cm_id = id; id->add_ref(id); memset(&qp_attrs, 0, sizeof(qp_attrs)); qp_attrs.orq_size = cep->ord; qp_attrs.irq_size = cep->ird; qp_attrs.sk = cep->sock; if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC) qp_attrs.flags = SIW_MPA_CRC; qp_attrs.state = SIW_QP_STATE_RTS; siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp)); /* Associate QP with CEP */ siw_cep_get(cep); qp->cep = cep; /* siw_qp_get(qp) already done by QP lookup */ cep->qp = qp; cep->state = SIW_EPSTATE_RDMA_MODE; /* Move socket RX/TX under QP control */ rv = siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA); up_write(&qp->state_lock); if (rv) goto error; siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n", qp_id(qp), params->private_data_len); rv = siw_send_mpareqrep(cep, params->private_data, params->private_data_len); if (rv != 0) goto error; if (wait_for_peer_rts) { siw_sk_assign_rtr_upcalls(cep); } else { siw_qp_socket_assoc(cep, qp); rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); if (rv) goto error; } siw_cep_set_free(cep); return 0; error_unlock: up_write(&qp->state_lock); error: siw_destroy_cep_sock(cep); cep->state = SIW_EPSTATE_CLOSED; siw_free_cm_id(cep); if (qp->cep) { siw_cep_put(cep); qp->cep = NULL; } cep->qp = NULL; siw_qp_put(qp); free_cep: siw_cep_set_free_and_put(cep); return rv; } /* * siw_reject() * * Local connection reject case. Send private data back to peer, * close connection and dereference connection id. */ int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len) { struct siw_cep *cep = (struct siw_cep *)id->provider_data; siw_cep_set_inuse(cep); siw_cep_put(cep); siw_cancel_mpatimer(cep); if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { siw_dbg_cep(cep, "out of state\n"); siw_cep_set_free_and_put(cep); /* put last reference */ return -ECONNRESET; } siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state, pd_len); if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) { cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ siw_send_mpareqrep(cep, pdata, pd_len); } siw_destroy_cep_sock(cep); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); return 0; } /* * siw_create_listen - Create resources for a listener's IWCM ID @id * * Starts listen on the socket address id->local_addr. * */ int siw_create_listen(struct iw_cm_id *id, int backlog) { struct socket *s; struct siw_cep *cep = NULL; struct net_device *ndev = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; int rv = 0; if (addr_family != AF_INET && addr_family != AF_INET6) return -EAFNOSUPPORT; rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) return rv; /* * Allow binding local port when still in TIME_WAIT from last close. */ sock_set_reuseaddr(s->sk); if (addr_family == AF_INET) { struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); /* For wildcard addr, limit binding to current device only */ if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) { ndev = ib_device_get_netdev(id->device, SIW_PORT); if (ndev) { s->sk->sk_bound_dev_if = ndev->ifindex; } else { rv = -ENODEV; goto error; } } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in)); } else { struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr); if (id->afonly) { rv = ip6_sock_set_v6only(s->sk); if (rv) { siw_dbg(id->device, "ip6_sock_set_v6only erro: %d\n", rv); goto error; } } /* For wildcard addr, limit binding to current device only */ if (ipv6_addr_any(&laddr->sin6_addr)) { ndev = ib_device_get_netdev(id->device, SIW_PORT); if (ndev) { s->sk->sk_bound_dev_if = ndev->ifindex; } else { rv = -ENODEV; goto error; } } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in6)); } if (rv) { siw_dbg(id->device, "socket bind error: %d\n", rv); goto error; } cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; goto error; } siw_cep_socket_assoc(cep, s); rv = siw_cm_alloc_work(cep, backlog); if (rv) { siw_dbg(id->device, "alloc_work error %d, backlog %d\n", rv, backlog); goto error; } rv = s->ops->listen(s, backlog); if (rv) { siw_dbg(id->device, "listen error %d\n", rv); goto error; } cep->cm_id = id; id->add_ref(id); /* * In case of a wildcard rdma_listen on a multi-homed device, * a listener's IWCM id is associated with more than one listening CEP. * * We currently use id->provider_data in three different ways: * * o For a listener's IWCM id, id->provider_data points to * the list_head of the list of listening CEPs. * Uses: siw_create_listen(), siw_destroy_listen() * * o For each accepted passive-side IWCM id, id->provider_data * points to the CEP itself. This is a consequence of * - siw_cm_upcall() setting event.provider_data = cep and * - the IWCM's cm_conn_req_handler() setting provider_data of the * new passive-side IWCM id equal to event.provider_data * Uses: siw_accept(), siw_reject() * * o For an active-side IWCM id, id->provider_data is not used at all. * */ if (!id->provider_data) { id->provider_data = kmalloc(sizeof(struct list_head), GFP_KERNEL); if (!id->provider_data) { rv = -ENOMEM; goto error; } INIT_LIST_HEAD((struct list_head *)id->provider_data); } list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); cep->state = SIW_EPSTATE_LISTENING; dev_put(ndev); siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr); return 0; error: siw_dbg(id->device, "failed: %d\n", rv); if (cep) { siw_cep_set_inuse(cep); siw_free_cm_id(cep); cep->sock = NULL; siw_socket_disassoc(s); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } sock_release(s); dev_put(ndev); return rv; } static void siw_drop_listeners(struct iw_cm_id *id) { struct list_head *p, *tmp; /* * In case of a wildcard rdma_listen on a multi-homed device, * a listener's IWCM id is associated with more than one listening CEP. */ list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { struct siw_cep *cep = list_entry(p, struct siw_cep, listenq); list_del(p); siw_dbg_cep(cep, "drop cep, state %d\n", cep->state); siw_cep_set_inuse(cep); siw_free_cm_id(cep); if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } } int siw_destroy_listen(struct iw_cm_id *id) { if (!id->provider_data) { siw_dbg(id->device, "no cep(s)\n"); return 0; } siw_drop_listeners(id); kfree(id->provider_data); id->provider_data = NULL; return 0; } int siw_cm_init(void) { /* * create_single_workqueue for strict ordering */ siw_cm_wq = create_singlethread_workqueue("siw_cm_wq"); if (!siw_cm_wq) return -ENOMEM; return 0; } void siw_cm_exit(void) { if (siw_cm_wq) destroy_workqueue(siw_cm_wq); }
2188 2187 2188 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 // SPDX-License-Identifier: GPL-2.0-only /* * CAIF USB handler * Copyright (C) ST-Ericsson AB 2011 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/usbnet.h> #include <linux/etherdevice.h> #include <net/netns/generic.h> #include <net/caif/caif_dev.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol USB support"); MODULE_LICENSE("GPL"); #define CFUSB_PAD_DESCR_SZ 1 /* Alignment descriptor length */ #define CFUSB_ALIGNMENT 4 /* Number of bytes to align. */ #define CFUSB_MAX_HEADLEN (CFUSB_PAD_DESCR_SZ + CFUSB_ALIGNMENT-1) #define STE_USB_VID 0x04cc /* USB Product ID for ST-Ericsson */ #define STE_USB_PID_CAIF 0x230f /* Product id for CAIF Modems */ struct cfusbl { struct cflayer layer; u8 tx_eth_hdr[ETH_HLEN]; }; static bool pack_added; static int cfusbl_receive(struct cflayer *layr, struct cfpkt *pkt) { u8 hpad; /* Remove padding. */ cfpkt_extr_head(pkt, &hpad, 1); cfpkt_extr_head(pkt, NULL, hpad); return layr->up->receive(layr->up, pkt); } static int cfusbl_transmit(struct cflayer *layr, struct cfpkt *pkt) { struct caif_payload_info *info; u8 hpad; u8 zeros[CFUSB_ALIGNMENT]; struct sk_buff *skb; struct cfusbl *usbl = container_of(layr, struct cfusbl, layer); skb = cfpkt_tonative(pkt); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); info = cfpkt_info(pkt); hpad = (info->hdr_len + CFUSB_PAD_DESCR_SZ) & (CFUSB_ALIGNMENT - 1); if (skb_headroom(skb) < ETH_HLEN + CFUSB_PAD_DESCR_SZ + hpad) { pr_warn("Headroom too small\n"); kfree_skb(skb); return -EIO; } memset(zeros, 0, hpad); cfpkt_add_head(pkt, zeros, hpad); cfpkt_add_head(pkt, &hpad, 1); cfpkt_add_head(pkt, usbl->tx_eth_hdr, sizeof(usbl->tx_eth_hdr)); return layr->dn->transmit(layr->dn, pkt); } static void cfusbl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { if (layr->up && layr->up->ctrlcmd) layr->up->ctrlcmd(layr->up, ctrl, layr->id); } static struct cflayer *cfusbl_create(int phyid, const u8 ethaddr[ETH_ALEN], u8 braddr[ETH_ALEN]) { struct cfusbl *this = kmalloc(sizeof(struct cfusbl), GFP_ATOMIC); if (!this) return NULL; caif_assert(offsetof(struct cfusbl, layer) == 0); memset(&this->layer, 0, sizeof(this->layer)); this->layer.receive = cfusbl_receive; this->layer.transmit = cfusbl_transmit; this->layer.ctrlcmd = cfusbl_ctrlcmd; snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "usb%d", phyid); this->layer.id = phyid; /* * Construct TX ethernet header: * 0-5 destination address * 5-11 source address * 12-13 protocol type */ ether_addr_copy(&this->tx_eth_hdr[ETH_ALEN], braddr); ether_addr_copy(&this->tx_eth_hdr[ETH_ALEN], ethaddr); this->tx_eth_hdr[12] = cpu_to_be16(ETH_P_802_EX1) & 0xff; this->tx_eth_hdr[13] = (cpu_to_be16(ETH_P_802_EX1) >> 8) & 0xff; pr_debug("caif ethernet TX-header dst:%pM src:%pM type:%02x%02x\n", this->tx_eth_hdr, this->tx_eth_hdr + ETH_ALEN, this->tx_eth_hdr[12], this->tx_eth_hdr[13]); return (struct cflayer *) this; } static void cfusbl_release(struct cflayer *layer) { kfree(layer); } static struct packet_type caif_usb_type __read_mostly = { .type = cpu_to_be16(ETH_P_802_EX1), }; static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct caif_dev_common common; struct cflayer *layer, *link_support; struct usbnet *usbnet; struct usb_device *usbdev; int res; if (what == NETDEV_UNREGISTER && dev->reg_state >= NETREG_UNREGISTERED) return 0; /* Check whether we have a NCM device, and find its VID/PID. */ if (!(dev->dev.parent && dev->dev.parent->driver && strcmp(dev->dev.parent->driver->name, "cdc_ncm") == 0)) return 0; usbnet = netdev_priv(dev); usbdev = usbnet->udev; pr_debug("USB CDC NCM device VID:0x%4x PID:0x%4x\n", le16_to_cpu(usbdev->descriptor.idVendor), le16_to_cpu(usbdev->descriptor.idProduct)); /* Check for VID/PID that supports CAIF */ if (!(le16_to_cpu(usbdev->descriptor.idVendor) == STE_USB_VID && le16_to_cpu(usbdev->descriptor.idProduct) == STE_USB_PID_CAIF)) return 0; if (what == NETDEV_UNREGISTER) module_put(THIS_MODULE); if (what != NETDEV_REGISTER) return 0; __module_get(THIS_MODULE); memset(&common, 0, sizeof(common)); common.use_frag = false; common.use_fcs = false; common.use_stx = false; common.link_select = CAIF_LINK_HIGH_BANDW; common.flowctrl = NULL; link_support = cfusbl_create(dev->ifindex, dev->dev_addr, dev->broadcast); if (!link_support) return -ENOMEM; if (dev->num_tx_queues > 1) pr_warn("USB device uses more than one tx queue\n"); res = caif_enroll_dev(dev, &common, link_support, CFUSB_MAX_HEADLEN, &layer, &caif_usb_type.func); if (res) goto err; if (!pack_added) dev_add_pack(&caif_usb_type); pack_added = true; strscpy(layer->name, dev->name, sizeof(layer->name)); return 0; err: cfusbl_release(link_support); return res; } static struct notifier_block caif_device_notifier = { .notifier_call = cfusbl_device_notify, .priority = 0, }; static int __init cfusbl_init(void) { return register_netdevice_notifier(&caif_device_notifier); } static void __exit cfusbl_exit(void) { unregister_netdevice_notifier(&caif_device_notifier); dev_remove_pack(&caif_usb_type); } module_init(cfusbl_init); module_exit(cfusbl_exit);
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PATH_H #define _LINUX_PATH_H struct dentry; struct vfsmount; struct path { struct vfsmount *mnt; struct dentry *dentry; } __randomize_layout; extern void path_get(const struct path *); extern void path_put(const struct path *); static inline int path_equal(const struct path *path1, const struct path *path2) { return path1->mnt == path2->mnt && path1->dentry == path2->dentry; } /* * Cleanup macro for use with __free(path_put). Avoids dereference and * copying @path unlike DEFINE_FREE(). path_put() will handle the empty * path correctly just ensure @path is initialized: * * struct path path __free(path_put) = {}; */ #define __free_path_put path_put #endif /* _LINUX_PATH_H */
2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 // SPDX-License-Identifier: GPL-2.0-only /* * File: pep-gprs.c * * GPRS over Phonet pipe end point socket * * Copyright (C) 2008 Nokia Corporation. * * Author: Rémi Denis-Courmont */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_ether.h> #include <linux/if_arp.h> #include <net/sock.h> #include <linux/if_phonet.h> #include <net/tcp_states.h> #include <net/phonet/gprs.h> #include <trace/events/sock.h> #define GPRS_DEFAULT_MTU 1400 struct gprs_dev { struct sock *sk; void (*old_state_change)(struct sock *); void (*old_data_ready)(struct sock *); void (*old_write_space)(struct sock *); struct net_device *dev; }; static __be16 gprs_type_trans(struct sk_buff *skb) { const u8 *pvfc; u8 buf; pvfc = skb_header_pointer(skb, 0, 1, &buf); if (!pvfc) return htons(0); /* Look at IP version field */ switch (*pvfc >> 4) { case 4: return htons(ETH_P_IP); case 6: return htons(ETH_P_IPV6); } return htons(0); } static void gprs_writeable(struct gprs_dev *gp) { struct net_device *dev = gp->dev; if (pep_writeable(gp->sk)) netif_wake_queue(dev); } /* * Socket callbacks */ static void gprs_state_change(struct sock *sk) { struct gprs_dev *gp = sk->sk_user_data; if (sk->sk_state == TCP_CLOSE_WAIT) { struct net_device *dev = gp->dev; netif_stop_queue(dev); netif_carrier_off(dev); } } static int gprs_recv(struct gprs_dev *gp, struct sk_buff *skb) { struct net_device *dev = gp->dev; int err = 0; __be16 protocol = gprs_type_trans(skb); if (!protocol) { err = -EINVAL; goto drop; } if (skb_headroom(skb) & 3) { struct sk_buff *rskb, *fs; int flen = 0; /* Phonet Pipe data header may be misaligned (3 bytes), * so wrap the IP packet as a single fragment of an head-less * socket buffer. The network stack will pull what it needs, * but at least, the whole IP payload is not memcpy'd. */ rskb = netdev_alloc_skb(dev, 0); if (!rskb) { err = -ENOBUFS; goto drop; } skb_shinfo(rskb)->frag_list = skb; rskb->len += skb->len; rskb->data_len += rskb->len; rskb->truesize += rskb->len; /* Avoid nested fragments */ skb_walk_frags(skb, fs) flen += fs->len; skb->next = skb_shinfo(skb)->frag_list; skb_frag_list_init(skb); skb->len -= flen; skb->data_len -= flen; skb->truesize -= flen; skb = rskb; } skb->protocol = protocol; skb_reset_mac_header(skb); skb->dev = dev; if (likely(dev->flags & IFF_UP)) { dev->stats.rx_packets++; dev->stats.rx_bytes += skb->len; netif_rx(skb); skb = NULL; } else err = -ENODEV; drop: if (skb) { dev_kfree_skb(skb); dev->stats.rx_dropped++; } return err; } static void gprs_data_ready(struct sock *sk) { struct gprs_dev *gp = sk->sk_user_data; struct sk_buff *skb; trace_sk_data_ready(sk); while ((skb = pep_read(sk)) != NULL) { skb_orphan(skb); gprs_recv(gp, skb); } } static void gprs_write_space(struct sock *sk) { struct gprs_dev *gp = sk->sk_user_data; if (netif_running(gp->dev)) gprs_writeable(gp); } /* * Network device callbacks */ static int gprs_open(struct net_device *dev) { struct gprs_dev *gp = netdev_priv(dev); gprs_writeable(gp); return 0; } static int gprs_close(struct net_device *dev) { netif_stop_queue(dev); return 0; } static netdev_tx_t gprs_xmit(struct sk_buff *skb, struct net_device *dev) { struct gprs_dev *gp = netdev_priv(dev); struct sock *sk = gp->sk; int len, err; switch (skb->protocol) { case htons(ETH_P_IP): case htons(ETH_P_IPV6): break; default: dev_kfree_skb(skb); return NETDEV_TX_OK; } skb_orphan(skb); skb_set_owner_w(skb, sk); len = skb->len; err = pep_write(sk, skb); if (err) { net_dbg_ratelimited("%s: TX error (%d)\n", dev->name, err); dev->stats.tx_aborted_errors++; dev->stats.tx_errors++; } else { dev->stats.tx_packets++; dev->stats.tx_bytes += len; } netif_stop_queue(dev); if (pep_writeable(sk)) netif_wake_queue(dev); return NETDEV_TX_OK; } static const struct net_device_ops gprs_netdev_ops = { .ndo_open = gprs_open, .ndo_stop = gprs_close, .ndo_start_xmit = gprs_xmit, }; static void gprs_setup(struct net_device *dev) { dev->features = NETIF_F_FRAGLIST; dev->type = ARPHRD_PHONET_PIPE; dev->flags = IFF_POINTOPOINT | IFF_NOARP; dev->mtu = GPRS_DEFAULT_MTU; dev->min_mtu = 576; dev->max_mtu = (PHONET_MAX_MTU - 11); dev->hard_header_len = 0; dev->addr_len = 0; dev->tx_queue_len = 10; dev->netdev_ops = &gprs_netdev_ops; dev->needs_free_netdev = true; } /* * External interface */ /* * Attach a GPRS interface to a datagram socket. * Returns the interface index on success, negative error code on error. */ int gprs_attach(struct sock *sk) { static const char ifname[] = "gprs%d"; struct gprs_dev *gp; struct net_device *dev; int err; if (unlikely(sk->sk_type == SOCK_STREAM)) return -EINVAL; /* need packet boundaries */ /* Create net device */ dev = alloc_netdev(sizeof(*gp), ifname, NET_NAME_UNKNOWN, gprs_setup); if (!dev) return -ENOMEM; gp = netdev_priv(dev); gp->sk = sk; gp->dev = dev; netif_stop_queue(dev); err = register_netdev(dev); if (err) { free_netdev(dev); return err; } lock_sock(sk); if (unlikely(sk->sk_user_data)) { err = -EBUSY; goto out_rel; } if (unlikely((1 << sk->sk_state & (TCPF_CLOSE|TCPF_LISTEN)) || sock_flag(sk, SOCK_DEAD))) { err = -EINVAL; goto out_rel; } sk->sk_user_data = gp; gp->old_state_change = sk->sk_state_change; gp->old_data_ready = sk->sk_data_ready; gp->old_write_space = sk->sk_write_space; sk->sk_state_change = gprs_state_change; sk->sk_data_ready = gprs_data_ready; sk->sk_write_space = gprs_write_space; release_sock(sk); sock_hold(sk); printk(KERN_DEBUG"%s: attached\n", dev->name); return dev->ifindex; out_rel: release_sock(sk); unregister_netdev(dev); return err; } void gprs_detach(struct sock *sk) { struct gprs_dev *gp = sk->sk_user_data; struct net_device *dev = gp->dev; lock_sock(sk); sk->sk_user_data = NULL; sk->sk_state_change = gp->old_state_change; sk->sk_data_ready = gp->old_data_ready; sk->sk_write_space = gp->old_write_space; release_sock(sk); printk(KERN_DEBUG"%s: detached\n", dev->name); unregister_netdev(dev); sock_put(sk); }
1 49 13 1 7 28 50 50 47 3 49 49 49 49 37 2 1 37 3 18 17 49 784 16 780 780 780 29 778 2 114 114 1 4 1 1 103 5 93 3 12 1 50 40 26 1 26 1 17 9 37 6 3 12 14 4 18 38 38 4 61 63 2 61 3 3 3 3 3 3 15 10 5 5 5 5 3 3 2 1 1 1 6 7 39 39 21 11 25 2 34 34 6 34 9 14 7 5 6 5 2 2 2 2 1 1 86 108 90 10 8 18 97 97 92 5 81 16 77 5 15 90 6 79 10 8 1 7 3 88 9 81 16 87 10 97 97 90 7 85 11 11 89 7 8 8 205 101 14 9 120 4 18 8 4 3 26 6 6 13 1 4 6 9 19 4 10 6 4 7 13 3 7 4 23 4 4 4 14 3 6 9 7 2 5 17 5 4 3 5 5 4 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 // SPDX-License-Identifier: GPL-2.0-or-later /* * common UDP/RAW code * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/interrupt.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/route.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/icmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/tcp_states.h> #include <net/dsfield.h> #include <net/sock_reuseport.h> #include <linux/errqueue.h> #include <linux/uaccess.h> static bool ipv6_mapped_addr_any(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } static void ip6_datagram_flow_key_init(struct flowi6 *fl6, const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ipv6_pinfo *np = inet6_sk(sk); int oif = sk->sk_bound_dev_if; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = sk->sk_protocol; fl6->daddr = sk->sk_v6_daddr; fl6->saddr = np->saddr; fl6->flowi6_mark = sk->sk_mark; fl6->fl6_dport = inet->inet_dport; fl6->fl6_sport = inet->inet_sport; fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6->flowi6_uid = sk->sk_uid; if (!oif) oif = np->sticky_pktinfo.ipi6_ifindex; if (!oif) { if (ipv6_addr_is_multicast(&fl6->daddr)) oif = READ_ONCE(np->mcast_oif); else oif = READ_ONCE(np->ucast_oif); } fl6->flowi6_oif = oif; security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) { struct ip6_flowlabel *flowlabel = NULL; struct in6_addr *final_p, final; struct ipv6_txoptions *opt; struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 fl6; int err = 0; if (inet6_test_bit(SNDFLOW, sk) && (np->flow_label & IPV6_FLOWLABEL_MASK)) { flowlabel = fl6_sock_lookup(sk, np->flow_label); if (IS_ERR(flowlabel)) return -EINVAL; } ip6_datagram_flow_key_init(&fl6, sk); rcu_read_lock(); opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt); final_p = fl6_update_dst(&fl6, opt, &final); rcu_read_unlock(); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } if (fix_sk_saddr) { if (ipv6_addr_any(&np->saddr)) np->saddr = fl6.saddr; if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { sk->sk_v6_rcv_saddr = fl6.saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } } ip6_sk_dst_store_flow(sk, dst, &fl6); out: fl6_sock_release(flowlabel); return err; } void ip6_datagram_release_cb(struct sock *sk) { struct dst_entry *dst; if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) return; rcu_read_lock(); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) { rcu_read_unlock(); return; } rcu_read_unlock(); ip6_datagram_dst_update(sk, false); } EXPORT_SYMBOL_GPL(ip6_datagram_release_cb); int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *daddr, old_daddr; __be32 fl6_flowlabel = 0; __be32 old_fl6_flowlabel; __be16 old_dport; int addr_type; int err; if (usin->sin6_family == AF_INET) { if (ipv6_only_sock(sk)) return -EAFNOSUPPORT; err = __ip4_datagram_connect(sk, uaddr, addr_len); goto ipv4_connected; } if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; if (inet6_test_bit(SNDFLOW, sk)) fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; if (ipv6_addr_any(&usin->sin6_addr)) { /* * connect to self */ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &usin->sin6_addr); else usin->sin6_addr = in6addr_loopback; } addr_type = ipv6_addr_type(&usin->sin6_addr); daddr = &usin->sin6_addr; if (addr_type & IPV6_ADDR_MAPPED) { struct sockaddr_in sin; if (ipv6_only_sock(sk)) { err = -ENETUNREACH; goto out; } sin.sin_family = AF_INET; sin.sin_addr.s_addr = daddr->s6_addr32[3]; sin.sin_port = usin->sin6_port; err = __ip4_datagram_connect(sk, (struct sockaddr *) &sin, sizeof(sin)); ipv4_connected: if (err) goto out; ipv6_addr_set_v4mapped(inet->inet_daddr, &sk->sk_v6_daddr); if (ipv6_addr_any(&np->saddr) || ipv6_mapped_addr_any(&np->saddr)) ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) || ipv6_mapped_addr_any(&sk->sk_v6_rcv_saddr)) { ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &sk->sk_v6_rcv_saddr); if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } goto out; } if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) { err = -EINVAL; goto out; } WRITE_ONCE(sk->sk_bound_dev_if, usin->sin6_scope_id); } if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST)) WRITE_ONCE(sk->sk_bound_dev_if, READ_ONCE(np->mcast_oif)); /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) { err = -EINVAL; goto out; } } /* save the current peer information before updating it */ old_daddr = sk->sk_v6_daddr; old_fl6_flowlabel = np->flow_label; old_dport = inet->inet_dport; sk->sk_v6_daddr = *daddr; np->flow_label = fl6_flowlabel; inet->inet_dport = usin->sin6_port; /* * Check for a route to destination an obtain the * destination cache for it. */ err = ip6_datagram_dst_update(sk, true); if (err) { /* Restore the socket peer info, to keep it consistent with * the old socket state */ sk->sk_v6_daddr = old_daddr; np->flow_label = old_fl6_flowlabel; inet->inet_dport = old_dport; goto out; } reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); out: return err; } EXPORT_SYMBOL_GPL(__ip6_datagram_connect); int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { int res; lock_sock(sk); res = __ip6_datagram_connect(sk, uaddr, addr_len); release_sock(sk); return res; } EXPORT_SYMBOL_GPL(ip6_datagram_connect); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, int addr_len) { DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr); if (sin6->sin6_family != AF_INET6) return -EAFNOSUPPORT; return ip6_datagram_connect(sk, uaddr, addr_len); } EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only); static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out) { switch (icmp6_hdr(skb)->icmp6_type) { case ICMPV6_TIME_EXCEED: case ICMPV6_DEST_UNREACH: ip_icmp_error_rfc4884(skb, out, sizeof(struct icmp6hdr), icmp6_hdr(skb)->icmp6_datagram_len * 8); } } void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { struct icmp6hdr *icmph = icmp6_hdr(skb); struct sock_exterr_skb *serr; if (!inet6_test_bit(RECVERR6, sk)) return; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) return; skb->protocol = htons(ETH_P_IPV6); serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6; serr->ee.ee_type = icmph->icmp6_type; serr->ee.ee_code = icmph->icmp6_code; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&(((struct ipv6hdr *)(icmph + 1))->daddr) - skb_network_header(skb); serr->port = port; __skb_pull(skb, payload - skb->data); if (inet6_test_bit(RECVERR6_RFC4884, sk)) ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } EXPORT_SYMBOL_GPL(ipv6_icmp_error); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { struct sock_exterr_skb *serr; struct ipv6hdr *iph; struct sk_buff *skb; if (!inet6_test_bit(RECVERR6, sk)) return; skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb) return; skb->protocol = htons(ETH_P_IPV6); skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); iph->daddr = fl6->daddr; ip6_flow_hdr(iph, 0, 0); serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_type = 0; serr->ee.ee_code = 0; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); serr->port = fl6->fl6_dport; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6hdr *iph; struct sk_buff *skb; struct ip6_mtuinfo *mtu_info; if (!np->rxopt.bits.rxpmtu) return; skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb) return; skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); iph->daddr = fl6->daddr; mtu_info = IP6CBMTU(skb); mtu_info->ip6m_mtu = mtu; mtu_info->ip6m_addr.sin6_family = AF_INET6; mtu_info->ip6m_addr.sin6_port = 0; mtu_info->ip6m_addr.sin6_flowinfo = 0; mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif; mtu_info->ip6m_addr.sin6_addr = ipv6_hdr(skb)->daddr; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); skb = xchg(&np->rxpmtu, skb); kfree_skb(skb); } /* For some errors we have valid addr_offset even with zero payload and * zero port. Also, addr_offset should be supported if port is set. */ static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr) { return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 || serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; } /* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL. * * At one point, excluding local errors was a quick test to identify icmp/icmp6 * errors. This is no longer true, but the test remained, so the v6 stack, * unlike v4, also honors cmsg requests on all wifi and timestamp errors. */ static bool ip6_datagram_support_cmsg(struct sk_buff *skb, struct sock_exterr_skb *serr) { if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) return true; if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) return false; if (!IP6CB(skb)->iif) return false; return true; } /* * Handle MSG_ERRQUEUE */ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in6 offender; } errhdr; int err; int copied; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (unlikely(err)) { kfree_skb(skb); return err; } sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); if (sin && ipv6_datagram_support_addr(serr)) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = serr->port; if (skb->protocol == htons(ETH_P_IPV6)) { const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset), struct ipv6hdr, daddr); sin->sin6_addr = ip6h->daddr; if (inet6_test_bit(SNDFLOW, sk)) sin->sin6_flowinfo = ip6_flowinfo(ip6h); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, IP6CB(skb)->iif); } else { ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset), &sin->sin6_addr); sin->sin6_scope_id = 0; } *addr_len = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); if (ip6_datagram_support_cmsg(skb, serr)) { sin->sin6_family = AF_INET6; if (np->rxopt.all) ip6_datagram_recv_common_ctl(sk, msg, skb); if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) ip6_datagram_recv_specific_ctl(sk, msg, skb); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, IP6CB(skb)->iif); } else { ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin->sin6_addr); if (inet_cmsg_flags(inet_sk(sk))) ip_cmsg_recv(msg, skb); } } put_cmsg(msg, SOL_IPV6, IPV6_RECVERR, sizeof(errhdr), &errhdr); /* Now we could try to dump offended packet options */ msg->msg_flags |= MSG_ERRQUEUE; err = copied; consume_skb(skb); out: return err; } EXPORT_SYMBOL_GPL(ipv6_recv_error); /* * Handle IPV6_RECVPATHMTU */ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; struct ip6_mtuinfo mtu_info; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); int err; int copied; err = -EAGAIN; skb = xchg(&np->rxpmtu, NULL); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; sock_recv_timestamp(msg, sk, skb); memcpy(&mtu_info, IP6CBMTU(skb), sizeof(mtu_info)); if (sin) { sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = 0; sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id; sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr; *addr_len = sizeof(*sin); } put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info); err = copied; out_free_skb: kfree_skb(skb); out: return err; } void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); bool is_ipv6 = skb->protocol == htons(ETH_P_IPV6); if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; if (is_ipv6) { src_info.ipi6_ifindex = IP6CB(skb)->iif; src_info.ipi6_addr = ipv6_hdr(skb)->daddr; } else { src_info.ipi6_ifindex = PKTINFO_SKB_CB(skb)->ipi_ifindex; ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, &src_info.ipi6_addr); } if (src_info.ipi6_ifindex >= 0) put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } } void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); struct inet6_skb_parm *opt = IP6CB(skb); unsigned char *nh = skb_network_header(skb); if (np->rxopt.bits.rxhlim) { int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { int tclass = ipv6_get_dsfield(ipv6_hdr(skb)); put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxflow) { __be32 flowinfo = ip6_flowinfo((struct ipv6hdr *)nh); if (flowinfo) put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } /* HbH is allowed only once */ if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); } if (opt->lastopt && (np->rxopt.bits.dstopts || np->rxopt.bits.srcrt)) { /* * Silly enough, but we need to reparse in order to * report extension headers (except for HbH) * in order. * * Also note that IPV6_RECVRTHDRDSTOPTS is NOT * (and WILL NOT be) defined because * IPV6_RECVDSTOPTS is more generic. --yoshfuji */ unsigned int off = sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; while (off <= opt->lastopt) { unsigned int len; u8 *ptr = nh + off; switch (nexthdr) { case IPPROTO_DSTOPTS: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; if (np->rxopt.bits.dstopts) put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr); break; case IPPROTO_ROUTING: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; if (np->rxopt.bits.srcrt) put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr); break; case IPPROTO_AH: nexthdr = ptr[0]; len = (ptr[1] + 2) << 2; break; default: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; break; } off += len; } } /* socket options in old style */ if (np->rxopt.bits.rxoinfo) { struct in6_pktinfo src_info; src_info.ipi6_ifindex = opt->iif; src_info.ipi6_addr = ipv6_hdr(skb)->daddr; put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.odstopts && opt->dst0) { u8 *ptr = nh + opt->dst0; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.osrcrt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt); put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr); } if (np->rxopt.bits.odstopts && opt->dst1) { u8 *ptr = nh + opt->dst1; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.rxorigdstaddr) { struct sockaddr_in6 sin6; __be16 _ports[2], *ports; ports = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_ports), &_ports); if (ports) { /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ sin6.sin6_family = AF_INET6; sin6.sin6_addr = ipv6_hdr(skb)->daddr; sin6.sin6_port = ports[1]; sin6.sin6_flowinfo = 0; sin6.sin6_scope_id = ipv6_iface_scope_id(&ipv6_hdr(skb)->daddr, opt->iif); put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); } } if (np->rxopt.bits.recvfragsize && opt->frag_max_size) { int val = opt->frag_max_size; put_cmsg(msg, SOL_IPV6, IPV6_RECVFRAGSIZE, sizeof(val), &val); } } void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { ip6_datagram_recv_common_ctl(sk, msg, skb); ip6_datagram_recv_specific_ctl(sk, msg, skb); } EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, struct ipcm6_cookie *ipc6) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; struct ipv6_rt_hdr *rthdr; struct ipv6_opt_hdr *hdr; struct ipv6_txoptions *opt = ipc6->opt; int len; int err = 0; for_each_cmsghdr(cmsg, msg) { int addr_type; if (!CMSG_OK(msg, cmsg)) { err = -EINVAL; goto exit_f; } if (cmsg->cmsg_level == SOL_SOCKET) { err = __sock_cmsg_send(sk, cmsg, &ipc6->sockc); if (err) return err; continue; } if (cmsg->cmsg_level != SOL_IPV6) continue; switch (cmsg->cmsg_type) { case IPV6_PKTINFO: case IPV6_2292PKTINFO: { struct net_device *dev = NULL; int src_idx; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { err = -EINVAL; goto exit_f; } src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); src_idx = src_info->ipi6_ifindex; if (src_idx) { if (fl6->flowi6_oif && src_idx != fl6->flowi6_oif && (READ_ONCE(sk->sk_bound_dev_if) != fl6->flowi6_oif || !sk_dev_equal_l3scope(sk, src_idx))) return -EINVAL; fl6->flowi6_oif = src_idx; } addr_type = __ipv6_addr_type(&src_info->ipi6_addr); rcu_read_lock(); if (fl6->flowi6_oif) { dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (!dev) { rcu_read_unlock(); return -ENODEV; } } else if (addr_type & IPV6_ADDR_LINKLOCAL) { rcu_read_unlock(); return -EINVAL; } if (addr_type != IPV6_ADDR_ANY) { int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) && !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr, dev, !strict, 0, IFA_F_TENTATIVE) && !ipv6_chk_acast_addr_src(net, dev, &src_info->ipi6_addr)) err = -EINVAL; else fl6->saddr = src_info->ipi6_addr; } rcu_read_unlock(); if (err) goto exit_f; break; } case IPV6_FLOWINFO: if (cmsg->cmsg_len < CMSG_LEN(4)) { err = -EINVAL; goto exit_f; } if (fl6->flowlabel&IPV6_FLOWINFO_MASK) { if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) { err = -EINVAL; goto exit_f; } } fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } opt->opt_nflen += len; opt->hopopt = hdr; break; case IPV6_2292DSTOPTS: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } if (opt->dst1opt) { err = -EINVAL; goto exit_f; } opt->opt_flen += len; opt->dst1opt = hdr; break; case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } if (cmsg->cmsg_type == IPV6_DSTOPTS) { opt->opt_flen += len; opt->dst1opt = hdr; } else { opt->opt_nflen += len; opt->dst0opt = hdr; } break; case IPV6_2292RTHDR: case IPV6_RTHDR: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) { err = -EINVAL; goto exit_f; } rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg); switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) { err = -EINVAL; goto exit_f; } break; #endif default: err = -EINVAL; goto exit_f; } len = ((rthdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } /* segments left must also match */ if ((rthdr->hdrlen >> 1) != rthdr->segments_left) { err = -EINVAL; goto exit_f; } opt->opt_nflen += len; opt->srcrt = rthdr; if (cmsg->cmsg_type == IPV6_2292RTHDR && opt->dst1opt) { int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3); opt->opt_nflen += dsthdrlen; opt->dst0opt = opt->dst1opt; opt->dst1opt = NULL; opt->opt_flen -= dsthdrlen; } break; case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { err = -EINVAL; goto exit_f; } ipc6->hlimit = *(int *)CMSG_DATA(cmsg); if (ipc6->hlimit < -1 || ipc6->hlimit > 0xff) { err = -EINVAL; goto exit_f; } break; case IPV6_TCLASS: { int tc; err = -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) goto exit_f; tc = *(int *)CMSG_DATA(cmsg); if (tc < -1 || tc > 0xff) goto exit_f; err = 0; ipc6->tclass = tc; break; } case IPV6_DONTFRAG: { int df; err = -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) goto exit_f; df = *(int *)CMSG_DATA(cmsg); if (df < 0 || df > 1) goto exit_f; err = 0; ipc6->dontfrag = df; break; } default: net_dbg_ratelimited("invalid cmsg type: %d\n", cmsg->cmsg_type); err = -EINVAL; goto exit_f; } } exit_f: return err; } EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl); void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, __u16 destp, int rqueue, int bucket) { const struct in6_addr *dest, *src; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; seq_printf(seq, "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n", bucket, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, sp->sk_state, sk_wmem_alloc_get(sp), rqueue, 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); }
522 490 75 2 67 2 9 76 819 168 1289 24 24 43 6 37 24 23 306 306 302 2 1210 409 209 696 60 224 608 606 695 504 107 433 23 23 13 10 2 8 8 2 6 8 8 8 4 78 78 69 1 10 1 1 1 68 377 375 367 68 339 78 3 14 14 4 2 14 11 17 18 10 16 1 4 1 4 6 12 4 19 7 12 4 9 3 5 3 4 4 4 5 34 34 8 2 11 15 23 2 2 5 2 11 2 226 2 4 2 2 209 14 29 14 3 20 11 4 10 16 2 41 15 11 109 103 10 2 203 2 196 1 7 4 1 98 101 38 10 28 1 1 27 186 17 167 4 57 36 40 5 7 7 5 4 36 6 34 34 2 16 13 17 5 14 11 45 45 29 3 13 2 2 1 12 21 21 2 22 16 22 45 339 148 229 147 145 2 35 129 8 24 114 2 119 7 115 3 1 2 1 227 2 14 226 12 215 193 33 33 9 1 1 7 4 3 1 33 33 18 225 225 2 194 113 2 3 1 2 1 3 4 1 1 3 3 721 612 304 248 54 224 27 2186 462 2186 1609 2041 578 475 548 142 186 366 114 114 114 114 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: FIB frontend. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <net/l3mdev.h> #include <net/lwtunnel.h> #include <trace/events/fib.h> #ifndef CONFIG_IP_MULTIPLE_TABLES static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; main_table = fib_trie_table(RT_TABLE_MAIN, NULL); if (!main_table) return -ENOMEM; local_table = fib_trie_table(RT_TABLE_LOCAL, main_table); if (!local_table) goto fail; hlist_add_head_rcu(&local_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]); hlist_add_head_rcu(&main_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]); return 0; fail: fib_free_table(main_table); return -ENOMEM; } #else struct fib_table *fib_new_table(struct net *net, u32 id) { struct fib_table *tb, *alias = NULL; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; tb = fib_get_table(net, id); if (tb) return tb; if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules) alias = fib_new_table(net, RT_TABLE_MAIN); tb = fib_trie_table(id, alias); if (!tb) return NULL; switch (id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, tb); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, tb); break; default: break; } h = id & (FIB_TABLE_HASHSZ - 1); hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; } EXPORT_SYMBOL_GPL(fib_new_table); /* caller must hold either rtnl or rcu read lock */ struct fib_table *fib_get_table(struct net *net, u32 id) { struct fib_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; h = id & (FIB_TABLE_HASHSZ - 1); head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) { if (tb->tb_id == id) return tb; } return NULL; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, struct fib_table *new) { #ifdef CONFIG_IP_MULTIPLE_TABLES switch (new->tb_id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, new); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, new); break; default: break; } #endif /* replace the old table in the hlist */ hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist); } int fib_unmerge(struct net *net) { struct fib_table *old, *new, *main_table; /* attempt to fetch local table if it has been allocated */ old = fib_get_table(net, RT_TABLE_LOCAL); if (!old) return 0; new = fib_trie_unmerge(old); if (!new) return -ENOMEM; /* table is already unmerged */ if (new == old) return 0; /* replace merged table with clean table */ fib_replace_table(net, old, new); fib_free_table(old); /* attempt to fetch main table if it has been allocated */ main_table = fib_get_table(net, RT_TABLE_MAIN); if (!main_table) return 0; /* flush local entries from main table */ fib_table_flush_external(main_table); return 0; } void fib_flush(struct net *net) { int flushed = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) flushed += fib_table_flush(net, tb, false); } if (flushed) rt_cache_flush(net); } /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr, u32 tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned int ret = RTN_BROADCAST; struct fib_table *table; if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; if (ipv4_is_multicast(addr)) return RTN_MULTICAST; rcu_read_lock(); table = fib_get_table(net, tb_id); if (table) { ret = RTN_UNICAST; if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0); if (!dev || dev == nhc->nhc_dev) ret = res.type; } } rcu_read_unlock(); return ret; } unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id) { return __inet_dev_addr_type(net, NULL, addr, tb_id); } EXPORT_SYMBOL(inet_addr_type_table); unsigned int inet_addr_type(struct net *net, __be32 addr) { return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL); } EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, dev, addr, rt_table); } EXPORT_SYMBOL(inet_dev_addr_type); /* inet_addr_type with dev == NULL but using the table from a dev * if one is associated */ unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, NULL, addr, rt_table); } EXPORT_SYMBOL(inet_addr_type_dev_table); __be32 fib_compute_spec_dst(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct rtable *rt; struct net *net; int scope; rt = skb_rtable(skb); if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == RTCF_LOCAL) return ip_hdr(skb)->daddr; in_dev = __in_dev_get_rcu(dev); net = dev_net(dev); scope = RT_SCOPE_UNIVERSE; if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev); struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))), .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; if (!fib_lookup(net, &fl4, &res, 0)) return fib_result_prefsrc(net, &res); } else { scope = RT_SCOPE_LINK; } return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) { bool dev_match = false; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (unlikely(fi->nh)) { dev_match = nexthop_uses_dev(fi->nh, dev); } else { int ret; for (ret = 0; ret < fib_info_num_path(fi); ret++) { const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); if (nhc_l3mdev_matches_dev(nhc, dev)) { dev_match = true; break; } } } #else if (fib_info_nhc(fi, 0)->nhc_dev == dev) dev_match = true; #endif return dev_match; } EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev); /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. * - figure out what "logical" interface this packet arrived * and calculate "specific destination" address. * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, int rpf, struct in_device *idev, u32 *itag) { struct net *net = dev_net(dev); enum skb_drop_reason reason; struct flow_keys flkeys; int ret, no_addr; struct fib_result res; struct flowi4 fl4; bool dev_match; fl4.flowi4_oif = 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev); fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_multipath_hash = 0; no_addr = idev->ifa_list == NULL; fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) { fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; } else { swap(fl4.fl4_sport, fl4.fl4_dport); } if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST) { if (res.type != RTN_LOCAL) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto e_inval; } else if (!IN_DEV_ACCEPT_LOCAL(idev)) { reason = SKB_DROP_REASON_IP_LOCAL_SOURCE; goto e_inval; } } fib_combine_itag(itag, &res); dev_match = fib_info_nh_uses_dev(res.fi, dev); /* This is not common, loopback packets retain skb_dst so normally they * would not even hit this slow path. */ dev_match = dev_match || (res.type == RTN_LOCAL && dev == net->loopback_dev); if (dev_match) { ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; return ret; } if (no_addr) goto last_resort; if (rpf == 1) goto e_rpf; fl4.flowi4_oif = dev->ifindex; ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; } return ret; last_resort: if (rpf) goto e_rpf; *itag = 0; return 0; e_inval: return -reason; e_rpf: return -SKB_DROP_REASON_IP_RPFILTER; } /* Ignore rp_filter for packets protected by IPsec. */ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); struct net *net = dev_net(dev); if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { if (IN_DEV_ACCEPT_LOCAL(idev)) goto ok; /* with custom local routes in place, checking local addresses * only will be too optimistic, with custom rules, checking * local addresses only can be too strict, e.g. due to vrf */ if (net->ipv4.fib_has_custom_local_routes || fib4_has_custom_rules(net)) goto full_check; /* Within the same container, it is regarded as a martian source, * and the same host but different containers are not. */ if (inet_lookup_ifaddr_rcu(net, src)) return -SKB_DROP_REASON_IP_LOCAL_SOURCE; ok: *itag = 0; return 0; } full_check: return __fib_validate_source(skb, src, dst, dscp, oif, dev, r, idev, itag); } static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; } static int put_rtax(struct nlattr *mx, int len, int type, u32 value) { struct nlattr *nla; nla = (struct nlattr *) ((char *) mx + len); nla->nla_type = type; nla->nla_len = nla_attr_size(4); *(u32 *) nla_data(nla) = value; return len + nla_total_size(4); } static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, struct fib_config *cfg) { __be32 addr; int plen; memset(cfg, 0, sizeof(*cfg)); cfg->fc_nlinfo.nl_net = net; if (rt->rt_dst.sa_family != AF_INET) return -EAFNOSUPPORT; /* * Check mask for validity: * a) it must be contiguous. * b) destination must have all host bits clear. * c) if application forgot to set correct family (AF_INET), * reject request unless it is absolutely clear i.e. * both family and mask are zero. */ plen = 32; addr = sk_extract_addr(&rt->rt_dst); if (!(rt->rt_flags & RTF_HOST)) { __be32 mask = sk_extract_addr(&rt->rt_genmask); if (rt->rt_genmask.sa_family != AF_INET) { if (mask || rt->rt_genmask.sa_family) return -EAFNOSUPPORT; } if (bad_mask(mask, addr)) return -EINVAL; plen = inet_mask_len(mask); } cfg->fc_dst_len = plen; cfg->fc_dst = addr; if (cmd != SIOCDELRT) { cfg->fc_nlflags = NLM_F_CREATE; cfg->fc_protocol = RTPROT_BOOT; } if (rt->rt_metric) cfg->fc_priority = rt->rt_metric - 1; if (rt->rt_flags & RTF_REJECT) { cfg->fc_scope = RT_SCOPE_HOST; cfg->fc_type = RTN_UNREACHABLE; return 0; } cfg->fc_scope = RT_SCOPE_NOWHERE; cfg->fc_type = RTN_UNICAST; if (rt->rt_dev) { char *colon; struct net_device *dev; char devname[IFNAMSIZ]; if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1)) return -EFAULT; devname[IFNAMSIZ-1] = 0; colon = strchr(devname, ':'); if (colon) *colon = 0; dev = __dev_get_by_name(net, devname); if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; cfg->fc_table = l3mdev_fib_table(dev); if (colon) { const struct in_ifaddr *ifa; struct in_device *in_dev; in_dev = __in_dev_get_rtnl_net(dev); if (!in_dev) return -ENODEV; *colon = ':'; in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; } if (!ifa) return -ENODEV; cfg->fc_prefsrc = ifa->ifa_local; } } addr = sk_extract_addr(&rt->rt_gateway); if (rt->rt_gateway.sa_family == AF_INET && addr) { unsigned int addr_type; cfg->fc_gw4 = addr; cfg->fc_gw_family = AF_INET; addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && addr_type == RTN_UNICAST) cfg->fc_scope = RT_SCOPE_UNIVERSE; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; if (cmd == SIOCDELRT) return 0; if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family) return -EINVAL; if (cfg->fc_scope == RT_SCOPE_NOWHERE) cfg->fc_scope = RT_SCOPE_LINK; if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) { struct nlattr *mx; int len = 0; mx = kcalloc(3, nla_total_size(4), GFP_KERNEL); if (!mx) return -ENOMEM; if (rt->rt_flags & RTF_MTU) len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40); if (rt->rt_flags & RTF_WINDOW) len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window); if (rt->rt_flags & RTF_IRTT) len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3); cfg->fc_mx = mx; cfg->fc_mx_len = len; } return 0; } /* * Handle IP routing ioctl calls. * These are used to manipulate the routing tables */ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) { struct fib_config cfg; int err; switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_net_lock(net); err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) err = fib_table_delete(net, tb, &cfg, NULL); else err = -ESRCH; } else { tb = fib_new_table(net, cfg.fc_table); if (tb) err = fib_table_insert(net, tb, &cfg, NULL); else err = -ENOBUFS; } /* allocated by rtentry_to_fib_config() */ kfree(cfg.fc_mx); } rtnl_net_unlock(net); return err; } return -EINVAL; } const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_GATEWAY] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_PREFSRC] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, }; int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, struct netlink_ext_ack *extack) { struct rtvia *via; int alen; if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA"); return -EINVAL; } via = nla_data(nla); alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); switch (via->rtvia_family) { case AF_INET: if (alen != sizeof(__be32)) { NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET; cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) if (alen != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET6; cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr); #else NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); return -EINVAL; #endif break; default: NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA"); return -EINVAL; } return 0; } static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_config *cfg, struct netlink_ext_ack *extack) { bool has_gw = false, has_via = false; struct nlattr *attr; int err, remaining; struct rtmsg *rtm; err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) goto errout; memset(cfg, 0, sizeof(*cfg)); rtm = nlmsg_data(nlh); if (!inet_validate_dscp(rtm->rtm_tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); err = -EINVAL; goto errout; } cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos); cfg->fc_dst_len = rtm->rtm_dst_len; cfg->fc_table = rtm->rtm_table; cfg->fc_protocol = rtm->rtm_protocol; cfg->fc_scope = rtm->rtm_scope; cfg->fc_type = rtm->rtm_type; cfg->fc_flags = rtm->rtm_flags; cfg->fc_nlflags = nlh->nlmsg_flags; cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = net; if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); err = -EINVAL; goto errout; } nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) { switch (nla_type(attr)) { case RTA_DST: cfg->fc_dst = nla_get_be32(attr); break; case RTA_OIF: cfg->fc_oif = nla_get_u32(attr); break; case RTA_GATEWAY: has_gw = true; cfg->fc_gw4 = nla_get_be32(attr); if (cfg->fc_gw4) cfg->fc_gw_family = AF_INET; break; case RTA_VIA: has_via = true; err = fib_gw_from_via(cfg, attr, extack); if (err) goto errout; break; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; case RTA_PREFSRC: cfg->fc_prefsrc = nla_get_be32(attr); break; case RTA_METRICS: cfg->fc_mx = nla_data(attr); cfg->fc_mx_len = nla_len(attr); break; case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), nla_len(attr), extack, false); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); cfg->fc_mp_len = nla_len(attr); break; case RTA_FLOW: cfg->fc_flow = nla_get_u32(attr); break; case RTA_TABLE: cfg->fc_table = nla_get_u32(attr); break; case RTA_ENCAP: cfg->fc_encap = attr; break; case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack, false); if (err < 0) goto errout; break; case RTA_NH_ID: cfg->fc_nh_id = nla_get_u32(attr); break; } } if (cfg->fc_dst_len > 32) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); err = -EINVAL; goto errout; } if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); err = -EINVAL; goto errout; } if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); err = -EINVAL; goto errout; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); err = -EINVAL; goto errout; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; return 0; errout: return err; } static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; rtnl_net_lock(net); if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); err = -EINVAL; goto unlock; } tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; goto unlock; } err = fib_table_delete(net, tb, &cfg, extack); unlock: rtnl_net_unlock(net); errout: return err; } static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; rtnl_net_lock(net); tb = fib_new_table(net, cfg.fc_table); if (!tb) { err = -ENOBUFS; goto unlock; } err = fib_table_insert(net, tb, &cfg, extack); if (!err && cfg.fc_type == RTN_LOCAL) net->ipv4.fib_has_custom_local_routes = true; unlock: rtnl_net_unlock(net); errout: return err; } int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[RTA_MAX + 1]; struct rtmsg *rtm; int err, i; if (filter->rtnl_held) ASSERT_RTNL(); rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); return -EINVAL; } if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_scope) { NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & RTM_F_CLONED) filter->dump_routes = false; else filter->dump_exceptions = false; filter->flags = rtm->rtm_flags; filter->protocol = rtm->rtm_protocol; filter->rt_type = rtm->rtm_type; filter->table_id = rtm->rtm_table; err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= RTA_MAX; ++i) { int ifindex; if (!tb[i]) continue; switch (i) { case RTA_TABLE: filter->table_id = nla_get_u32(tb[i]); break; case RTA_OIF: ifindex = nla_get_u32(tb[i]); if (filter->rtnl_held) filter->dev = __dev_get_by_index(net, ifindex); else filter->dev = dev_get_by_index_rcu(net, ifindex); if (!filter->dev) return -ENODEV; break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; } } if (filter->flags || filter->protocol || filter->rt_type || filter->table_id || filter->dev) { filter->filter_set = 1; cb->answer_flags = NLM_F_DUMP_FILTERED; } return 0; } EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct fib_dump_filter filter = { .dump_routes = true, .dump_exceptions = true, .rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; int dumped = 0, err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED); } /* ipv4 does not use prefix flag */ if (filter.flags & RTM_F_PREFIX) goto unlock; if (filter.table_id) { tb = fib_get_table(net, filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET) goto unlock; NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist"); err = -ENOENT; goto unlock; } err = fib_table_dump(tb, skb, cb, &filter); goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; err = 0; for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { if (e < s_e) goto next; if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); err = fib_table_dump(tb, skb, cb, &filter); if (err < 0) goto out; dumped = 1; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); return err; } /* Prepare and feed intra-kernel routing request. * Really, it should be netlink message, but :-( netlink * can be not configured, so that we feed it directly * to fib engine. It is legal, because all events occur * only when netlink is already locked. */ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa, u32 rt_priority) { struct net *net = dev_net(ifa->ifa_dev->dev); u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, .fc_type = type, .fc_dst = dst, .fc_dst_len = dst_len, .fc_priority = rt_priority, .fc_prefsrc = ifa->ifa_local, .fc_oif = ifa->ifa_dev->dev->ifindex, .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, .fc_nlinfo = { .nl_net = net, }, }; if (!tb_id) tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL; tb = fib_new_table(net, tb_id); if (!tb) return; cfg.fc_table = tb->tb_id; if (type != RTN_LOCAL) cfg.fc_scope = RT_SCOPE_LINK; else cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) fib_table_insert(net, tb, &cfg, NULL); else fib_table_delete(net, tb, &cfg, NULL); } void fib_add_ifaddr(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *prim = ifa; __be32 mask = ifa->ifa_mask; __be32 addr = ifa->ifa_local; __be32 prefix = ifa->ifa_address & mask; if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); if (!prim) { pr_warn("%s: bug: prim == NULL\n", __func__); return; } } fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0); if (!(dev->flags & IFF_UP)) return; /* Add broadcast address, if it is explicitly assigned. */ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); arp_invalidate(dev, ifa->ifa_broadcast, false); } if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim, ifa->ifa_rt_priority); /* Add the network broadcast address, when it makes sense */ if (ifa->ifa_prefixlen < 31) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); arp_invalidate(dev, prefix | ~mask, false); } } } void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric) { __be32 prefix = ifa->ifa_address & ifa->ifa_mask; struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; if (!(dev->flags & IFF_UP) || ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) || ipv4_is_zeronet(prefix) || (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32)) return; /* add the new */ fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, new_metric); /* delete the old */ fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority); } /* Delete primary or secondary address. * Optionally, on secondary address promotion consider the addresses * from subnet iprim as deleted, even if they are in device list. * In this case the secondary ifa can be in device list. */ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; struct in_ifaddr *prim = ifa, *prim1 = NULL; __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 #define BRD_OK 2 #define BRD0_OK 4 #define BRD1_OK 8 unsigned int ok = 0; int subnet = 0; /* Primary network */ int gone = 1; /* Address is missing */ int same_prefsrc = 0; /* Another primary with same IP */ if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (!prim) { /* if the device has been deleted, we don't perform * address promotion */ if (!in_dev->dead) pr_warn("%s: bug: prim == NULL\n", __func__); return; } if (iprim && iprim != prim) { pr_warn("%s: bug: iprim != prim\n", __func__); return; } } else if (!ipv4_is_zeronet(any) && (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, any, ifa->ifa_prefixlen, prim, 0); subnet = 1; } if (in_dev->dead) goto no_promotions; /* Deletion is more complicated than add. * We should take care of not to delete too much :-) * * Scan address list to be sure that addresses are really gone. */ rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa1, in_dev) { if (ifa1 == ifa) { /* promotion, keep the IP */ gone = 0; continue; } /* Ignore IFAs from our subnet */ if (iprim && ifa1->ifa_mask == iprim->ifa_mask && inet_ifa_match(ifa1->ifa_address, iprim)) continue; /* Ignore ifa1 if it uses different primary IP (prefsrc) */ if (ifa1->ifa_flags & IFA_F_SECONDARY) { /* Another address from our subnet? */ if (ifa1->ifa_mask == prim->ifa_mask && inet_ifa_match(ifa1->ifa_address, prim)) prim1 = prim; else { /* We reached the secondaries, so * same_prefsrc should be determined. */ if (!same_prefsrc) continue; /* Search new prim1 if ifa1 is not * using the current prim1 */ if (!prim1 || ifa1->ifa_mask != prim1->ifa_mask || !inet_ifa_match(ifa1->ifa_address, prim1)) prim1 = inet_ifa_byprefix(in_dev, ifa1->ifa_address, ifa1->ifa_mask); if (!prim1) continue; if (prim1->ifa_local != prim->ifa_local) continue; } } else { if (prim->ifa_local != ifa1->ifa_local) continue; prim1 = ifa1; if (prim != prim1) same_prefsrc = 1; } if (ifa->ifa_local == ifa1->ifa_local) ok |= LOCAL_OK; if (ifa->ifa_broadcast == ifa1->ifa_broadcast) ok |= BRD_OK; if (brd == ifa1->ifa_broadcast) ok |= BRD1_OK; if (any == ifa1->ifa_broadcast) ok |= BRD0_OK; /* primary has network specific broadcasts */ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; if (!ipv4_is_zeronet(any1)) { if (ifa->ifa_broadcast == brd1 || ifa->ifa_broadcast == any1) ok |= BRD_OK; if (brd == brd1 || brd == any1) ok |= BRD1_OK; if (any == brd1 || any == any1) ok |= BRD0_OK; } } } rcu_read_unlock(); no_promotions: if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); if (subnet && ifa->ifa_prefixlen < 31) { if (!(ok & BRD1_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim, 0); if (!(ok & BRD0_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim, 0); } if (!(ok & LOCAL_OK)) { unsigned int addr_type; fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0); /* Check, that this local address finally disappeared. */ addr_type = inet_addr_type_dev_table(dev_net(dev), dev, ifa->ifa_local); if (gone && addr_type != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. */ if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } #undef LOCAL_OK #undef BRD_OK #undef BRD0_OK #undef BRD1_OK } static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) { struct fib_result res; struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, .flowi4_tos = frn->fl_tos & INET_DSCP_MASK, .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; rcu_read_lock(); tb = fib_get_table(net, frn->tb_id_in); frn->err = -ENOENT; if (tb) { local_bh_disable(); frn->tb_id = tb->tb_id; frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { frn->prefixlen = res.prefixlen; frn->nh_sel = res.nh_sel; frn->type = res.type; frn->scope = res.scope; } local_bh_enable(); } rcu_read_unlock(); } static void nl_fib_input(struct sk_buff *skb) { struct net *net; struct fib_result_nl *frn; struct nlmsghdr *nlh; u32 portid; net = sock_net(skb->sk); nlh = nlmsg_hdr(skb); if (skb->len < nlmsg_total_size(sizeof(*frn)) || skb->len < nlh->nlmsg_len || nlmsg_len(nlh) < sizeof(*frn)) return; skb = netlink_skb_clone(skb, GFP_KERNEL); if (!skb) return; nlh = nlmsg_hdr(skb); frn = nlmsg_data(nlh); nl_fib_lookup(net, frn); portid = NETLINK_CB(skb).portid; /* netlink portid */ NETLINK_CB(skb).portid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ nlmsg_unicast(net->ipv4.fibnl, skb, portid); } static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; struct netlink_kernel_cfg cfg = { .input = nl_fib_input, }; sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg); if (!sk) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; return 0; } static void nl_fib_lookup_exit(struct net *net) { netlink_kernel_release(net->ipv4.fibnl); net->ipv4.fibnl = NULL; } static void fib_disable_ip(struct net_device *dev, unsigned long event, bool force) { if (fib_sync_down_dev(dev, event, force)) fib_flush(dev_net(dev)); else rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = ptr; struct net_device *dev = ifa->ifa_dev->dev; struct net *net = dev_net(dev); switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); atomic_inc(&net->ipv4.dev_addr_genid); if (!ifa->ifa_dev->ifa_list) { /* Last address was deleted from this interface. * Disable IP. */ fib_disable_ip(dev, event, true); } else { rt_cache_flush(net); } break; } return NOTIFY_DONE; } static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_changeupper_info *upper_info = ptr; struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); struct in_ifaddr *ifa; unsigned int flags; if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, event, true); rt_flush_dev(dev); return NOTIFY_DONE; } in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return NOTIFY_DONE; switch (event) { case NETDEV_UP: in_dev_for_each_ifa_rtnl(ifa, in_dev) { fib_add_ifaddr(ifa); } #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: fib_disable_ip(dev, event, false); break; case NETDEV_CHANGE: flags = dev_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) fib_sync_up(dev, RTNH_F_LINKDOWN); else fib_sync_down_dev(dev, event, false); rt_cache_flush(net); break; case NETDEV_CHANGEMTU: fib_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(net); break; case NETDEV_CHANGEUPPER: upper_info = ptr; /* flush all routes if dev is linked to or unlinked from * an L3 master device (e.g., VRF) */ if (upper_info->upper_dev && netif_is_l3_master(upper_info->upper_dev)) fib_disable_ip(dev, NETDEV_DOWN, true); break; } return NOTIFY_DONE; } static struct notifier_block fib_inetaddr_notifier = { .notifier_call = fib_inetaddr_event, }; static struct notifier_block fib_netdev_notifier = { .notifier_call = fib_netdev_event, }; static int __net_init ip_fib_net_init(struct net *net) { int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; err = fib4_notifier_init(net); if (err) return err; #ifdef CONFIG_IP_ROUTE_MULTIPATH /* Default to 3-tuple */ net->ipv4.sysctl_fib_multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; #endif /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv4.fib_table_hash) { err = -ENOMEM; goto err_table_hash_alloc; } err = fib4_rules_init(net); if (err < 0) goto err_rules_init; return 0; err_rules_init: kfree(net->ipv4.fib_table_hash); err_table_hash_alloc: fib4_notifier_exit(net); return err; } static void ip_fib_net_exit(struct net *net) { int i; ASSERT_RTNL_NET(net); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif /* Destroy the tables in reverse order to guarantee that the * local table, ID 255, is destroyed before the main table, ID * 254. This is necessary as the local table may contain * references to data contained in the main table. */ for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) { struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); fib_table_flush(net, tb, true); fib_free_table(tb); } } #ifdef CONFIG_IP_MULTIPLE_TABLES fib4_rules_exit(net); #endif kfree(net->ipv4.fib_table_hash); fib4_notifier_exit(net); } static int __net_init fib_net_init(struct net *net) { int error; #ifdef CONFIG_IP_ROUTE_CLASSID atomic_set(&net->ipv4.fib_num_tclassid_users, 0); #endif error = ip_fib_net_init(net); if (error < 0) goto out; error = fib4_semantics_init(net); if (error) goto out_semantics; error = nl_fib_lookup_init(net); if (error < 0) goto out_nlfl; error = fib_proc_init(net); if (error < 0) goto out_proc; out: return error; out_proc: nl_fib_lookup_exit(net); out_nlfl: fib4_semantics_exit(net); out_semantics: rtnl_net_lock(net); ip_fib_net_exit(net); rtnl_net_unlock(net); goto out; } static void __net_exit fib_net_exit(struct net *net) { fib_proc_exit(net); nl_fib_lookup_exit(net); } static void __net_exit fib_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { __rtnl_net_lock(net); ip_fib_net_exit(net); __rtnl_net_unlock(net); } rtnl_unlock(); list_for_each_entry(net, net_list, exit_list) fib4_semantics_exit(net); } static struct pernet_operations fib_net_ops = { .init = fib_net_init, .exit = fib_net_exit, .exit_batch = fib_net_exit_batch, }; static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWROUTE, .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELROUTE, .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; void __init ip_fib_init(void) { fib_trie_init(); register_pernet_subsys(&fib_net_ops); register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); rtnl_register_many(fib_rtnl_msg_handlers); }
1 1 1 1 1 1 1 1 1 1 1 1 1 10 7 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 /* * IPv4 specific functions of netfilter core * * Rusty Russell (C) 2000 -- This code is GPL. * Patrick McHardy (C) 2006-2012 */ #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/gfp.h> #include <linux/export.h> #include <net/route.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/inet_dscp.h> #include <net/netfilter/nf_queue.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi4 fl4 = {}; __be32 saddr = iph->saddr; __u8 flags; struct net_device *dev = skb_dst(skb)->dev; struct flow_keys flkeys; unsigned int hh_len; sk = sk_to_full_sk(sk); flags = sk ? inet_sk_flowi_flags(sk) : 0; if (addr_type == RTN_UNSPEC) addr_type = inet_addr_type_dev_table(net, dev, saddr); if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) flags |= FLOWI_FLAG_ANYSRC; else saddr = 0; /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. */ fl4.daddr = iph->daddr; fl4.saddr = saddr; fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; fl4.flowi4_flags = flags; fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); /* Drop old route. */ skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); if (skb_dst(skb)->error) return skb_dst(skb)->error; #ifdef CONFIG_XFRM if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && xfrm_decode_session(net, skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { struct dst_entry *dst = skb_dst(skb); skb_dst_set(skb, NULL); dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); skb_dst_set(skb, dst); } #endif /* Change in oif may mean change in hh_len. */ hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 0, GFP_ATOMIC)) return -ENOMEM; return 0; } EXPORT_SYMBOL(ip_route_me_harder); int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict __always_unused) { struct rtable *rt = ip_route_output_key(net, &fl->u.ip4); if (IS_ERR(rt)) return PTR_ERR(rt); *dst = &rt->dst; return 0; } EXPORT_SYMBOL_GPL(nf_ip_route);
7 64 64 6 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 /* * DRBG based on NIST SP800-90A * * Copyright Stephan Mueller <smueller@chronox.de>, 2014 * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL are * required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #ifndef _DRBG_H #define _DRBG_H #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/module.h> #include <linux/crypto.h> #include <linux/slab.h> #include <crypto/internal/rng.h> #include <crypto/rng.h> #include <linux/fips.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/workqueue.h> /* * Concatenation Helper and string operation helper * * SP800-90A requires the concatenation of different data. To avoid copying * buffers around or allocate additional memory, the following data structure * is used to point to the original memory with its size. In addition, it * is used to build a linked list. The linked list defines the concatenation * of individual buffers. The order of memory block referenced in that * linked list determines the order of concatenation. */ struct drbg_string { const unsigned char *buf; size_t len; struct list_head list; }; static inline void drbg_string_fill(struct drbg_string *string, const unsigned char *buf, size_t len) { string->buf = buf; string->len = len; INIT_LIST_HEAD(&string->list); } struct drbg_state; typedef uint32_t drbg_flag_t; struct drbg_core { drbg_flag_t flags; /* flags for the cipher */ __u8 statelen; /* maximum state length */ __u8 blocklen_bytes; /* block size of output in bytes */ char cra_name[CRYPTO_MAX_ALG_NAME]; /* mapping to kernel crypto API */ /* kernel crypto API backend cipher name */ char backend_cra_name[CRYPTO_MAX_ALG_NAME]; }; struct drbg_state_ops { int (*update)(struct drbg_state *drbg, struct list_head *seed, int reseed); int (*generate)(struct drbg_state *drbg, unsigned char *buf, unsigned int buflen, struct list_head *addtl); int (*crypto_init)(struct drbg_state *drbg); int (*crypto_fini)(struct drbg_state *drbg); }; struct drbg_test_data { struct drbg_string *testentropy; /* TEST PARAMETER: test entropy */ }; enum drbg_seed_state { DRBG_SEED_STATE_UNSEEDED, DRBG_SEED_STATE_PARTIAL, /* Seeded with !rng_is_initialized() */ DRBG_SEED_STATE_FULL, }; struct drbg_state { struct mutex drbg_mutex; /* lock around DRBG */ unsigned char *V; /* internal state 10.1.1.1 1a) */ unsigned char *Vbuf; /* hash: static value 10.1.1.1 1b) hmac / ctr: key */ unsigned char *C; unsigned char *Cbuf; /* Number of RNG requests since last reseed -- 10.1.1.1 1c) */ size_t reseed_ctr; size_t reseed_threshold; /* some memory the DRBG can use for its operation */ unsigned char *scratchpad; unsigned char *scratchpadbuf; void *priv_data; /* Cipher handle */ struct crypto_skcipher *ctr_handle; /* CTR mode cipher handle */ struct skcipher_request *ctr_req; /* CTR mode request handle */ __u8 *outscratchpadbuf; /* CTR mode output scratchpad */ __u8 *outscratchpad; /* CTR mode aligned outbuf */ struct crypto_wait ctr_wait; /* CTR mode async wait obj */ struct scatterlist sg_in, sg_out; /* CTR mode SGLs */ enum drbg_seed_state seeded; /* DRBG fully seeded? */ unsigned long last_seed_time; bool pr; /* Prediction resistance enabled? */ bool fips_primed; /* Continuous test primed? */ unsigned char *prev; /* FIPS 140-2 continuous test value */ struct crypto_rng *jent; const struct drbg_state_ops *d_ops; const struct drbg_core *core; struct drbg_string test_data; }; static inline __u8 drbg_statelen(struct drbg_state *drbg) { if (drbg && drbg->core) return drbg->core->statelen; return 0; } static inline __u8 drbg_blocklen(struct drbg_state *drbg) { if (drbg && drbg->core) return drbg->core->blocklen_bytes; return 0; } static inline __u8 drbg_keylen(struct drbg_state *drbg) { if (drbg && drbg->core) return (drbg->core->statelen - drbg->core->blocklen_bytes); return 0; } static inline size_t drbg_max_request_bytes(struct drbg_state *drbg) { /* SP800-90A requires the limit 2**19 bits, but we return bytes */ return (1 << 16); } static inline size_t drbg_max_addtl(struct drbg_state *drbg) { /* SP800-90A requires 2**35 bytes additional info str / pers str */ #if (__BITS_PER_LONG == 32) /* * SP800-90A allows smaller maximum numbers to be returned -- we * return SIZE_MAX - 1 to allow the verification of the enforcement * of this value in drbg_healthcheck_sanity. */ return (SIZE_MAX - 1); #else return (1UL<<35); #endif } static inline size_t drbg_max_requests(struct drbg_state *drbg) { /* SP800-90A requires 2**48 maximum requests before reseeding */ return (1<<20); } /* * This is a wrapper to the kernel crypto API function of * crypto_rng_generate() to allow the caller to provide additional data. * * @drng DRBG handle -- see crypto_rng_get_bytes * @outbuf output buffer -- see crypto_rng_get_bytes * @outlen length of output buffer -- see crypto_rng_get_bytes * @addtl_input additional information string input buffer * @addtllen length of additional information string buffer * * return * see crypto_rng_get_bytes */ static inline int crypto_drbg_get_bytes_addtl(struct crypto_rng *drng, unsigned char *outbuf, unsigned int outlen, struct drbg_string *addtl) { return crypto_rng_generate(drng, addtl->buf, addtl->len, outbuf, outlen); } /* * TEST code * * This is a wrapper to the kernel crypto API function of * crypto_rng_generate() to allow the caller to provide additional data and * allow furnishing of test_data * * @drng DRBG handle -- see crypto_rng_get_bytes * @outbuf output buffer -- see crypto_rng_get_bytes * @outlen length of output buffer -- see crypto_rng_get_bytes * @addtl_input additional information string input buffer * @addtllen length of additional information string buffer * @test_data filled test data * * return * see crypto_rng_get_bytes */ static inline int crypto_drbg_get_bytes_addtl_test(struct crypto_rng *drng, unsigned char *outbuf, unsigned int outlen, struct drbg_string *addtl, struct drbg_test_data *test_data) { crypto_rng_set_entropy(drng, test_data->testentropy->buf, test_data->testentropy->len); return crypto_rng_generate(drng, addtl->buf, addtl->len, outbuf, outlen); } /* * TEST code * * This is a wrapper to the kernel crypto API function of * crypto_rng_reset() to allow the caller to provide test_data * * @drng DRBG handle -- see crypto_rng_reset * @pers personalization string input buffer * @perslen length of additional information string buffer * @test_data filled test data * * return * see crypto_rng_reset */ static inline int crypto_drbg_reset_test(struct crypto_rng *drng, struct drbg_string *pers, struct drbg_test_data *test_data) { crypto_rng_set_entropy(drng, test_data->testentropy->buf, test_data->testentropy->len); return crypto_rng_reset(drng, pers->buf, pers->len); } /* DRBG type flags */ #define DRBG_CTR ((drbg_flag_t)1<<0) #define DRBG_HMAC ((drbg_flag_t)1<<1) #define DRBG_HASH ((drbg_flag_t)1<<2) #define DRBG_TYPE_MASK (DRBG_CTR | DRBG_HMAC | DRBG_HASH) /* DRBG strength flags */ #define DRBG_STRENGTH128 ((drbg_flag_t)1<<3) #define DRBG_STRENGTH192 ((drbg_flag_t)1<<4) #define DRBG_STRENGTH256 ((drbg_flag_t)1<<5) #define DRBG_STRENGTH_MASK (DRBG_STRENGTH128 | DRBG_STRENGTH192 | \ DRBG_STRENGTH256) enum drbg_prefixes { DRBG_PREFIX0 = 0x00, DRBG_PREFIX1, DRBG_PREFIX2, DRBG_PREFIX3 }; #endif /* _DRBG_H */
5 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> struct nft_lookup { struct nft_set *set; u8 sreg; u8 dreg; bool dreg_set; bool invert; struct nft_set_binding binding; }; #ifdef CONFIG_MITIGATION_RETPOLINE bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { if (set->ops == &nft_set_hash_fast_type.ops) return nft_hash_lookup_fast(net, set, key, ext); if (set->ops == &nft_set_hash_type.ops) return nft_hash_lookup(net, set, key, ext); if (set->ops == &nft_set_rhash_type.ops) return nft_rhash_lookup(net, set, key, ext); if (set->ops == &nft_set_bitmap_type.ops) return nft_bitmap_lookup(net, set, key, ext); if (set->ops == &nft_set_pipapo_type.ops) return nft_pipapo_lookup(net, set, key, ext); #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) if (set->ops == &nft_set_pipapo_avx2_type.ops) return nft_pipapo_avx2_lookup(net, set, key, ext); #endif if (set->ops == &nft_set_rbtree_type.ops) return nft_rbtree_lookup(net, set, key, ext); WARN_ON_ONCE(1); return set->ops->lookup(net, set, key, ext); } EXPORT_SYMBOL_GPL(nft_set_do_lookup); #endif void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; const struct nft_set_ext *ext = NULL; const struct net *net = nft_net(pkt); bool found; found = nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext) ^ priv->invert; if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { regs->verdict.code = NFT_BREAK; return; } } if (ext) { if (priv->dreg_set) nft_data_copy(&regs->data[priv->dreg], nft_set_ext_data(ext), set->dlen); nft_set_elem_update_expr(ext, regs, pkt); } } static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { [NFTA_LOOKUP_SET] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_FLAGS] = NLA_POLICY_MASK(NLA_BE32, NFT_LOOKUP_F_INV), }; static int nft_lookup_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_lookup *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; u32 flags; int err; if (tb[NFTA_LOOKUP_SET] == NULL || tb[NFTA_LOOKUP_SREG] == NULL) return -EINVAL; set = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_LOOKUP_SET], tb[NFTA_LOOKUP_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); err = nft_parse_register_load(ctx, tb[NFTA_LOOKUP_SREG], &priv->sreg, set->klen); if (err < 0) return err; if (tb[NFTA_LOOKUP_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS])); if (flags & NFT_LOOKUP_F_INV) priv->invert = true; } if (tb[NFTA_LOOKUP_DREG] != NULL) { if (priv->invert) return -EINVAL; if (!(set->flags & NFT_SET_MAP)) return -EINVAL; err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG], &priv->dreg, NULL, nft_set_datatype(set), set->dlen); if (err < 0) return err; priv->dreg_set = true; } else if (set->flags & NFT_SET_MAP) { /* Map given, but user asks for lookup only (i.e. to * ignore value assoicated with key). * * This makes no sense for anonymous maps since they are * scoped to the rule, but for named sets this can be useful. */ if (set->flags & NFT_SET_ANONYMOUS) return -EINVAL; } priv->binding.flags = set->flags & NFT_SET_MAP; err = nf_tables_bind_set(ctx, set, &priv->binding); if (err < 0) return err; priv->set = set; return 0; } static void nft_lookup_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_lookup *priv = nft_expr_priv(expr); nf_tables_deactivate_set(ctx, priv->set, &priv->binding, phase); } static void nft_lookup_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_lookup *priv = nft_expr_priv(expr); nf_tables_activate_set(ctx, priv->set); } static void nft_lookup_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_lookup *priv = nft_expr_priv(expr); nf_tables_destroy_set(ctx, priv->set); } static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_lookup *priv = nft_expr_priv(expr); u32 flags = priv->invert ? NFT_LOOKUP_F_INV : 0; if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name)) goto nla_put_failure; if (nft_dump_register(skb, NFTA_LOOKUP_SREG, priv->sreg)) goto nla_put_failure; if (priv->dreg_set) if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_lookup_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_lookup *priv = nft_expr_priv(expr); struct nft_set_iter iter; if (!(priv->set->flags & NFT_SET_MAP) || priv->set->dtype != NFT_DATA_VERDICT) return 0; iter.genmask = nft_genmask_next(ctx->net); iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; iter.fn = nft_setelem_validate; priv->set->ops->walk(ctx, priv->set, &iter); if (!iter.err) iter.err = nft_set_catchall_validate(ctx, priv->set); if (iter.err < 0) return iter.err; return 0; } static bool nft_lookup_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_lookup *priv = nft_expr_priv(expr); if (priv->set->flags & NFT_SET_MAP) nft_reg_track_cancel(track, priv->dreg, priv->set->dlen); return false; } static const struct nft_expr_ops nft_lookup_ops = { .type = &nft_lookup_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), .eval = nft_lookup_eval, .init = nft_lookup_init, .activate = nft_lookup_activate, .deactivate = nft_lookup_deactivate, .destroy = nft_lookup_destroy, .dump = nft_lookup_dump, .validate = nft_lookup_validate, .reduce = nft_lookup_reduce, }; struct nft_expr_type nft_lookup_type __read_mostly = { .name = "lookup", .ops = &nft_lookup_ops, .policy = nft_lookup_policy, .maxattr = NFTA_LOOKUP_MAX, .owner = THIS_MODULE, };
81 8 9 9 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Trace point definitions for the RDMA Connect Manager. * * Author: Chuck Lever <chuck.lever@oracle.com> * * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM rdma_cma #if !defined(_TRACE_RDMA_CMA_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RDMA_CMA_H #include <linux/tracepoint.h> #include <trace/misc/rdma.h> DECLARE_EVENT_CLASS(cma_fsm_class, TP_PROTO( const struct rdma_id_private *id_priv ), TP_ARGS(id_priv), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos ) ); #define DEFINE_CMA_FSM_EVENT(name) \ DEFINE_EVENT(cma_fsm_class, cm_##name, \ TP_PROTO( \ const struct rdma_id_private *id_priv \ ), \ TP_ARGS(id_priv)) DEFINE_CMA_FSM_EVENT(send_rtu); DEFINE_CMA_FSM_EVENT(send_rej); DEFINE_CMA_FSM_EVENT(send_mra); DEFINE_CMA_FSM_EVENT(send_sidr_req); DEFINE_CMA_FSM_EVENT(send_sidr_rep); DEFINE_CMA_FSM_EVENT(disconnect); DEFINE_CMA_FSM_EVENT(sent_drep); DEFINE_CMA_FSM_EVENT(sent_dreq); DEFINE_CMA_FSM_EVENT(id_destroy); TRACE_EVENT(cm_id_attach, TP_PROTO( const struct rdma_id_private *id_priv, const struct ib_device *device ), TP_ARGS(id_priv, device), TP_STRUCT__entry( __field(u32, cm_id) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) __string(devname, device->name) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); __assign_str(devname); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __get_str(devname) ) ); DECLARE_EVENT_CLASS(cma_qp_class, TP_PROTO( const struct rdma_id_private *id_priv ), TP_ARGS(id_priv), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(u32, qp_num) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->qp_num = id_priv->qp_num; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u qp_num=%u", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, __entry->qp_num ) ); #define DEFINE_CMA_QP_EVENT(name) \ DEFINE_EVENT(cma_qp_class, cm_##name, \ TP_PROTO( \ const struct rdma_id_private *id_priv \ ), \ TP_ARGS(id_priv)) DEFINE_CMA_QP_EVENT(send_req); DEFINE_CMA_QP_EVENT(send_rep); DEFINE_CMA_QP_EVENT(qp_destroy); /* * enum ib_wp_type, from include/rdma/ib_verbs.h */ #define IB_QP_TYPE_LIST \ ib_qp_type(SMI) \ ib_qp_type(GSI) \ ib_qp_type(RC) \ ib_qp_type(UC) \ ib_qp_type(UD) \ ib_qp_type(RAW_IPV6) \ ib_qp_type(RAW_ETHERTYPE) \ ib_qp_type(RAW_PACKET) \ ib_qp_type(XRC_INI) \ ib_qp_type_end(XRC_TGT) #undef ib_qp_type #undef ib_qp_type_end #define ib_qp_type(x) TRACE_DEFINE_ENUM(IB_QPT_##x); #define ib_qp_type_end(x) TRACE_DEFINE_ENUM(IB_QPT_##x); IB_QP_TYPE_LIST #undef ib_qp_type #undef ib_qp_type_end #define ib_qp_type(x) { IB_QPT_##x, #x }, #define ib_qp_type_end(x) { IB_QPT_##x, #x } #define rdma_show_qp_type(x) \ __print_symbolic(x, IB_QP_TYPE_LIST) TRACE_EVENT(cm_qp_create, TP_PROTO( const struct rdma_id_private *id_priv, const struct ib_pd *pd, const struct ib_qp_init_attr *qp_init_attr, int rc ), TP_ARGS(id_priv, pd, qp_init_attr, rc), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, pd_id) __field(u32, tos) __field(u32, qp_num) __field(u32, send_wr) __field(u32, recv_wr) __field(int, rc) __field(unsigned long, qp_type) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->pd_id = pd->res.id; __entry->tos = id_priv->tos; __entry->send_wr = qp_init_attr->cap.max_send_wr; __entry->recv_wr = qp_init_attr->cap.max_recv_wr; __entry->rc = rc; if (!rc) { __entry->qp_num = id_priv->qp_num; __entry->qp_type = id_priv->id.qp_type; } else { __entry->qp_num = 0; __entry->qp_type = 0; } memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u pd.id=%u qp_type=%s" " send_wr=%u recv_wr=%u qp_num=%u rc=%d", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, __entry->pd_id, rdma_show_qp_type(__entry->qp_type), __entry->send_wr, __entry->recv_wr, __entry->qp_num, __entry->rc ) ); TRACE_EVENT(cm_req_handler, TP_PROTO( const struct rdma_id_private *id_priv, int event ), TP_ARGS(id_priv, event), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(unsigned long, event) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->event = event; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu)", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, rdma_show_ib_cm_event(__entry->event), __entry->event ) ); TRACE_EVENT(cm_event_handler, TP_PROTO( const struct rdma_id_private *id_priv, const struct rdma_cm_event *event ), TP_ARGS(id_priv, event), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(unsigned long, event) __field(int, status) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->event = event->event; __entry->status = event->status; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu/%d)", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, rdma_show_cm_event(__entry->event), __entry->event, __entry->status ) ); TRACE_EVENT(cm_event_done, TP_PROTO( const struct rdma_id_private *id_priv, const struct rdma_cm_event *event, int result ), TP_ARGS(id_priv, event, result), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(unsigned long, event) __field(int, result) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->event = event->event; __entry->result = result; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s consumer returns %d", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, rdma_show_cm_event(__entry->event), __entry->result ) ); DECLARE_EVENT_CLASS(cma_client_class, TP_PROTO( const struct ib_device *device ), TP_ARGS(device), TP_STRUCT__entry( __string(name, device->name) ), TP_fast_assign( __assign_str(name); ), TP_printk("device name=%s", __get_str(name) ) ); #define DEFINE_CMA_CLIENT_EVENT(name) \ DEFINE_EVENT(cma_client_class, cm_##name, \ TP_PROTO( \ const struct ib_device *device \ ), \ TP_ARGS(device)) DEFINE_CMA_CLIENT_EVENT(add_one); DEFINE_CMA_CLIENT_EVENT(remove_one); #endif /* _TRACE_RDMA_CMA_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE cma_trace #include <trace/define_trace.h>
13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 // SPDX-License-Identifier: GPL-2.0-only /* * Landlock - Credential hooks * * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI * Copyright © 2024-2025 Microsoft Corporation */ #include <linux/binfmts.h> #include <linux/cred.h> #include <linux/lsm_hooks.h> #include "common.h" #include "cred.h" #include "ruleset.h" #include "setup.h" static void hook_cred_transfer(struct cred *const new, const struct cred *const old) { const struct landlock_cred_security *const old_llcred = landlock_cred(old); if (old_llcred->domain) { landlock_get_ruleset(old_llcred->domain); *landlock_cred(new) = *old_llcred; } } static int hook_cred_prepare(struct cred *const new, const struct cred *const old, const gfp_t gfp) { hook_cred_transfer(new, old); return 0; } static void hook_cred_free(struct cred *const cred) { struct landlock_ruleset *const dom = landlock_cred(cred)->domain; if (dom) landlock_put_ruleset_deferred(dom); } #ifdef CONFIG_AUDIT static int hook_bprm_creds_for_exec(struct linux_binprm *const bprm) { /* Resets for each execution. */ landlock_cred(bprm->cred)->domain_exec = 0; return 0; } #endif /* CONFIG_AUDIT */ static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, hook_cred_prepare), LSM_HOOK_INIT(cred_transfer, hook_cred_transfer), LSM_HOOK_INIT(cred_free, hook_cred_free), #ifdef CONFIG_AUDIT LSM_HOOK_INIT(bprm_creds_for_exec, hook_bprm_creds_for_exec), #endif /* CONFIG_AUDIT */ }; __init void landlock_add_cred_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), &landlock_lsmid); }
4 26 11 1 30 10 20 30 5 4 1 59 59 59 59 59 24 24 11 4 2 5 2 9 9 14 1 8 8 16 11 4 5 6 2 9 3 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ #include <net/tcp.h> #include <net/xfrm.h> #include <net/busy_poll.h> #include <net/rstreason.h> static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) return true; if (after(end_seq, s_win) && before(seq, e_win)) return true; return seq == e_win && seq == end_seq; } static enum tcp_tw_status tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, const struct sk_buff *skb, int mib_idx) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx, &tcptw->tw_last_oow_ack_time)) { /* Send ACK. Note, we do not put the bucket, * it will be released by caller. */ return TCP_TW_ACK_OOW; } /* We are rate-limiting, so just release the tw sock and drop skb. */ inet_twsk_put(tw); return TCP_TW_SUCCESS; } static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq, u32 rcv_nxt) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao; ao = rcu_dereference(tcptw->ao_info); if (unlikely(ao && seq < rcv_nxt)) WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1); #endif WRITE_ONCE(tcptw->tw_rcv_nxt, seq); } /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN * (and, probably, tail of data) and one or more our ACKs are lost. * * What is TIME-WAIT timeout? It is associated with maximal packet * lifetime in the internet, which results in wrong conclusion, that * it is set to catch "old duplicate segments" wandering out of their path. * It is not quite correct. This timeout is calculated so that it exceeds * maximal retransmission timeout enough to allow to lose one (or more) * segments sent by peer and our ACKs. This time may be calculated from RTO. * * When TIME-WAIT socket receives RST, it means that another end * finally closed and we are allowed to kill TIME-WAIT too. * * Second purpose of TIME-WAIT is catching old duplicate segments. * Well, certainly it is pure paranoia, but if we load TIME-WAIT * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. * * If we invented some more clever way to catch duplicates * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. * * The algorithm below is based on FORMAL INTERPRETATION of RFCs. * When you compare it to RFCs, please, read section SEGMENT ARRIVES * from the very beginning. * * NOTE. With recycling (and later with fin-wait-2) TW bucket * is _not_ stateless. It means, that strictly speaking we must * spinlock it. I do not want! Well, probability of misbehaviour * is ridiculously low and, seems, we could use some mb() tricks * to avoid misread sequence numbers, states etc. --ANK * * We don't need to initialize tmp_out.sack_ok as we don't use the results */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, u32 *tw_isn, enum skb_drop_reason *drop_reason) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); struct tcp_options_received tmp_opt; bool paws_reject = false; int ts_recent_stamp; tmp_opt.saw_tstamp = 0; ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; tmp_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); tmp_opt.ts_recent_stamp = ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, rcv_nxt, rcv_nxt + tcptw->tw_rcv_wnd)) return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); if (th->rst) goto kill; if (th->syn && !before(TCP_SKB_CB(skb)->seq, rcv_nxt)) return TCP_TW_RST; /* Dup ACK? */ if (!th->ack || !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* New data or FIN. If new data arrive after half-duplex close, * reset. */ if (!th->fin || TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1) return TCP_TW_RST; /* FIN arrived, enter true time-wait state. */ WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT); twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq, rcv_nxt); if (tmp_opt.saw_tstamp) { u64 ts = tcp_clock_ms(); WRITE_ONCE(tw->tw_entry_stamp, ts); WRITE_ONCE(tcptw->tw_ts_recent_stamp, div_u64(ts, MSEC_PER_SEC)); WRITE_ONCE(tcptw->tw_ts_recent, tmp_opt.rcv_tsval); } inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } /* * Now real TIME-WAIT state. * * RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: * * (1) assigns its initial sequence number for the new * connection to be larger than the largest sequence * number it used on the previous connection incarnation, * and * * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ if (!paws_reject && (TCP_SKB_CB(skb)->seq == rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } } else { inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); } if (tmp_opt.saw_tstamp) { WRITE_ONCE(tcptw->tw_ts_recent, tmp_opt.rcv_tsval); WRITE_ONCE(tcptw->tw_ts_recent_stamp, ktime_get_seconds()); } inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* Out of window segment. All the segments are ACKed immediately. The only exception is new SYN. We accept it, if it is not old duplicate and we are not in danger to be killed by delayed old duplicates. RFC check is that it has newer sequence number works at rates <40Mbit/sec. However, if paws works, it is reliable AND even more, we even may relax silly seq space cutoff. RED-PEN: we violate main RFC requirement, if this SYN will appear old duplicate (i.e. we receive RST in reply to SYN-ACK), we must return socket to time-wait state. It is not good, but not fatal yet. */ if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, rcv_nxt) || (tmp_opt.saw_tstamp && (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) { u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; *tw_isn = isn; return TCP_TW_SYN; } if (paws_reject) { *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); } if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. * * If it is ACKless SYN it may be both old duplicate * and new good SYN with random sequence number <rcv_nxt. * Do not reschedule in the last case. */ if (paws_reject || th->ack) inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } inet_twsk_put(tw); return TCP_TW_SUCCESS; } EXPORT_IPV6_MOD(tcp_timewait_state_process); static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) { #ifdef CONFIG_TCP_MD5SIG const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; /* * The timewait bucket does not have the key DB from the * sock structure. We just make a quick copy of the * md5 key being used (if indeed we are using one) * so the timewait ack generating code has the key. */ tcptw->tw_md5_key = NULL; if (!static_branch_unlikely(&tcp_md5_needed.key)) return; key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); if (!tcptw->tw_md5_key) return; if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) goto out_free; tcp_md5_add_sigpool(); } return; out_free: WARN_ON_ONCE(1); kfree(tcptw->tw_md5_key); tcptw->tw_md5_key = NULL; #endif } /* * Move a socket to time-wait or dead fin-wait-2 state. */ void tcp_time_wait(struct sock *sk, int state, int timeo) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct inet_timewait_sock *tw; tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); tw->tw_mark = sk->sk_mark; tw->tw_priority = READ_ONCE(sk->sk_priority); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; /* refreshed when we enter true TIME-WAIT state */ tw->tw_entry_stamp = tcp_time_stamp_ms(tp); tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tw->tw_usec_ts = tp->tcp_usec_ts; tcptw->tw_last_oow_ack_time = 0; tcptw->tw_tx_delay = tp->tcp_tx_delay; tw->tw_txhash = sk->sk_txhash; tw->tw_tx_queue_mapping = sk->sk_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING tw->tw_rx_queue_mapping = sk->sk_rx_queue_mapping; #endif #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); tw->tw_ipv6only = sk->sk_ipv6only; } #endif tcp_time_wait_init(sk, tcptw); tcp_ao_time_wait(tcptw, tp); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; /* Linkage updates. * Note that access to tw after this point is illegal. */ inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); tcp_done(sk); } EXPORT_SYMBOL(tcp_time_wait); #ifdef CONFIG_TCP_MD5SIG static void tcp_md5_twsk_free_rcu(struct rcu_head *head) { struct tcp_md5sig_key *key; key = container_of(head, struct tcp_md5sig_key, rcu); kfree(key); static_branch_slow_dec_deferred(&tcp_md5_needed); tcp_md5_release_sigpool(); } #endif void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); if (twsk->tw_md5_key) call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu); } #endif tcp_ao_destroy_sock(sk, true); } EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor); void tcp_twsk_purge(struct list_head *net_exit_list) { bool purged_once = false; struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { if (net->ipv4.tcp_death_row.hashinfo->pernet) { /* Even if tw_refcount == 1, we must clean up kernel reqsk */ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo); } else if (!purged_once) { inet_twsk_purge(&tcp_hashinfo); purged_once = true; } } } /* Warning : This function is called without sk_listener being locked. * Be sure to read socket fields once, as their value could change under us. */ void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk_listener); int full_space = tcp_full_space(sk_listener); u32 window_clamp; __u8 rcv_wscale; u32 rcv_wnd; int mss; mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); else if (full_space < rcv_wnd * mss) full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, rcv_wnd); ireq->rcv_wscale = rcv_wscale; } static void tcp_ecn_openreq_child(struct tcp_sock *tp, const struct request_sock *req) { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? TCP_ECN_MODE_RFC3168 : TCP_ECN_DISABLED); } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; if (ca_key != TCP_CA_UNSPEC) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && bpf_try_module_get(ca, ca->owner))) { icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; ca_got_dst = true; } rcu_read_unlock(); } /* If no valid choice made yet, assign current system default ca. */ if (!ca_got_dst && (!icsk->icsk_ca_setsockopt || !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); } EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child); static void smc_check_reset_syn_req(const struct tcp_sock *oldtp, struct request_sock *req, struct tcp_sock *newtp) { #if IS_ENABLED(CONFIG_SMC) struct inet_request_sock *ireq; if (static_branch_unlikely(&tcp_have_smc)) { ireq = inet_rsk(req); if (oldtp->syn_smc && !ireq->smc_ok) newtp->syn_smc = 0; } #endif } /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * * Actually, we could lots of memory writes here. tp of listening * socket contains all necessary default parameters. */ struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk; const struct tcp_sock *oldtp; struct tcp_sock *newtp; u32 seq; if (!newsk) return NULL; newicsk = inet_csk(newsk); newtp = tcp_sk(newsk); oldtp = tcp_sk(sk); smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; seq = treq->rcv_isn + 1; newtp->rcv_wup = seq; WRITE_ONCE(newtp->copied_seq, seq); WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; seq = treq->snt_isn + 1; newtp->snd_sml = newtp->snd_una = seq; WRITE_ONCE(newtp->snd_nxt, seq); newtp->snd_up = seq; INIT_LIST_HEAD(&newtp->tsq_node); INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); newicsk->icsk_ack.lrcvtime = tcp_jiffies32; newtp->lsndtime = tcp_jiffies32; newsk->sk_txhash = READ_ONCE(treq->txhash); newtp->total_retrans = req->num_retrans; tcp_init_xmit_timers(newsk); WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; } else { newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp, 65535U); } newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; newtp->max_window = newtp->snd_wnd; if (newtp->rx_opt.tstamp_ok) { newtp->tcp_usec_ts = treq->req_usec_ts; newtp->rx_opt.ts_recent = req->ts_recent; newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { newtp->tcp_usec_ts = 0; newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } if (req->num_timeout) { newtp->total_rto = req->num_timeout; newtp->undo_marker = treq->snt_isn; if (newtp->tcp_usec_ts) { newtp->retrans_stamp = treq->snt_synack; newtp->total_rto_time = (u32)(tcp_clock_us() - newtp->retrans_stamp) / USEC_PER_MSEC; } else { newtp->retrans_stamp = div_u64(treq->snt_synack, USEC_PER_SEC / TCP_TS_HZ); newtp->total_rto_time = tcp_clock_ms() - newtp->retrans_stamp; } newtp->total_rto_recoveries = 1; } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ #endif #ifdef CONFIG_TCP_AO newtp->ao_info = NULL; if (tcp_rsk_used_ao(req)) { struct tcp_ao_key *ao_key; ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1); if (ao_key) newtp->tcp_header_len += tcp_ao_len_aligned(ao_key); } #endif if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newtp, req); newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); newtp->bpf_chg_cc_inprogress = 0; tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1); return newsk; } EXPORT_SYMBOL(tcp_create_openreq_child); /* * Process an incoming packet for SYN_RECV sockets represented as a * request_sock. Normally sk is the listener socket but for TFO it * points to the child socket. * * XXX (TFO) - The current impl contains a special check for ack * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? * * We don't need to initialize tmp_opt.sack_ok as we don't use the results * * Note: If @fastopen is true, this can be called from process context. * Otherwise, this is from BH context. */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *req_stolen, enum skb_drop_reason *drop_reason) { struct tcp_options_received tmp_opt; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool tsecr_reject = false; bool paws_reject = false; bool own_req; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; if (tmp_opt.rcv_tsecr) { if (inet_rsk(req)->tstamp_ok && !fastopen) tsecr_reject = !between(tmp_opt.rcv_tsecr, tcp_rsk(req)->snt_tsval_first, READ_ONCE(tcp_rsk(req)->snt_tsval_last)); tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; } /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. */ tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } /* Check for pure retransmitted SYN. */ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && flg == TCP_FLAG_SYN && !paws_reject) { /* * RFC793 draws (Incorrectly! It was fixed in RFC1122) * this case on figure 6 and figure 8, but formal * protocol description says NOTHING. * To be more exact, it says that we should send ACK, * because this segment (at least, if it has no data) * is out of window. * * CONCLUSION: RFC793 (even with RFC1122) DOES NOT * describe SYN-RECV state. All the description * is wrong, we cannot believe to it and should * rely only on common sense and implementation * experience. * * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. * * Note that even if there is new data in the SYN packet * they will be thrown away too. * * Reset timer after retransmitting SYNACK, similar to * the idea of fast retransmit in recovery. */ if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time) && !inet_rtx_syn_ack(sk, req)) { unsigned long expires = jiffies; expires += reqsk_timeout(req, TCP_RTO_MAX); if (!fastopen) mod_timer_pending(&req->rsk_timer, expires); else req->rsk_timer.expires = expires; } return NULL; } /* Further reproduces section "SEGMENT ARRIVES" for state SYN-RECEIVED of RFC793. It is broken, however, it does not work only when SYNs are crossed. You would think that SYN crossing is impossible here, since we should have a SYN_SENT socket (from connect()) on our end, but this is not true if the crossed SYNs were sent to both ends by a malicious third party. We must defend against this, and to do that we first verify the ACK (as per RFC793, page 36) and reset if it is invalid. Is this a true full defense? To convince ourselves, let us consider a way in which the ACK test can still pass in this 'malicious crossed SYNs' case. Malicious sender sends identical SYNs (and thus identical sequence numbers) to both A and B: A: gets SYN, seq=7 B: gets SYN, seq=7 By our good fortune, both A and B select the same initial send sequence number of seven :-) A: sends SYN|ACK, seq=7, ack_seq=8 B: sends SYN|ACK, seq=7, ack_seq=8 So we are now A eating this SYN|ACK, ACK test passes. So does sequence test, SYN is truncated, and thus we consider it a bare ACK. If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this bare ACK. Otherwise, we create an established connection. Both ends (listening sockets) accept the new incoming connection and try to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. But generally, we should (RFC lies!) to accept ACK from SYNACK both here and in tcp_rcv_state_process(). tcp_rcv_state_process() does not, hence, we do not too. Note that the case is absolutely generic: we cannot optimize anything here without violating protocol. All the checks must be made before attempt to create socket. */ /* RFC793 page 36: "If the connection is in any non-synchronized state ... * and the incoming segment acknowledges something not yet * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * * Invalid ACK: reset will be sent by listening socket. * Note that the ACK validity check for a Fast Open socket is done * elsewhere and is checked directly against the child socket rather * than req because user data may have been sent out. */ if ((flg & TCP_FLAG_ACK) && !fastopen && (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) return sk; /* RFC793: "first check sequence number". */ if (paws_reject || tsecr_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + tcp_synack_window(req))) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) { SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS); NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); } else if (tsecr_reject) { SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR); NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED); } else { SKB_DR_SET(*drop_reason, TCP_OVERWINDOW); } return NULL; } /* In sequence, PAWS is OK. */ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting at tcp_rsk(req)->rcv_isn + 1. */ flg &= ~TCP_FLAG_SYN; } /* RFC793: "second check the RST bit" and * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; } /* ACK sequence verified above, just make sure ACK is * set. If ACK not set, just silently drop the packet. * * XXX (TFO) - if we ever allow "data after SYN", the * following check needs to be removed. */ if (!(flg & TCP_FLAG_ACK)) return NULL; /* For Fast Open no more processing is needed (sk is the * child socket). */ if (fastopen) return sk; /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. THIS SEGMENT MUST MOVE SOCKET TO * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, req, &own_req); if (!child) goto listen_overflow; if (own_req && tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval; if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); return child; } sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); *req_stolen = !own_req; return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW); if (sk != req->rsk_listener) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) { inet_rsk(req)->acked = 1; return NULL; } embryonic_reset: if (!(flg & TCP_FLAG_RST)) { /* Received a bad SYN pkt - for TFO We try not to reset * the local connection unless it's really necessary to * avoid becoming vulnerable to outside attack aiming at * resetting legit local connections. */ req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); tcp_reset(sk, skb); } if (!fastopen) { bool unlinked = inet_csk_reqsk_queue_drop(sk, req); if (unlinked) __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); *req_stolen = !unlinked; } return NULL; } EXPORT_IPV6_MOD(tcp_check_req); /* * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with * the new socket. * * For the vast majority of cases child->sk_state will be TCP_SYN_RECV * when entering. But other states are possible due to a race condition * where after __inet_lookup_established() fails but before the listener * locked is obtained, other packets cause the same connection to * be created. */ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb) __releases(&((child)->sk_lock.slock)) { enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; int state = child->sk_state; /* record sk_napi_id and sk_rx_queue_mapping of child. */ sk_mark_napi_id_set(child, skb); tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { reason = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening * socket does not protect us more. */ __sk_add_backlog(child, skb); } bh_unlock_sock(child); sock_put(child); return reason; } EXPORT_IPV6_MOD(tcp_child_process);
48 36 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 #ifndef LLC_H #define LLC_H /* * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rculist_nulls.h> #include <linux/hash.h> #include <linux/jhash.h> #include <linux/atomic.h> struct net_device; struct packet_type; struct sk_buff; struct llc_addr { unsigned char lsap; unsigned char mac[IFHWADDRLEN]; }; #define LLC_SAP_STATE_INACTIVE 1 #define LLC_SAP_STATE_ACTIVE 2 #define LLC_SK_DEV_HASH_BITS 6 #define LLC_SK_DEV_HASH_ENTRIES (1<<LLC_SK_DEV_HASH_BITS) #define LLC_SK_LADDR_HASH_BITS 6 #define LLC_SK_LADDR_HASH_ENTRIES (1<<LLC_SK_LADDR_HASH_BITS) /** * struct llc_sap - Defines the SAP component * * @station - station this sap belongs to * @state - sap state * @p_bit - only lowest-order bit used * @f_bit - only lowest-order bit used * @laddr - SAP value in this 'lsap' * @node - entry in station sap_list * @sk_list - LLC sockets this one manages */ struct llc_sap { unsigned char state; unsigned char p_bit; unsigned char f_bit; refcount_t refcnt; int (*rcv_func)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); struct llc_addr laddr; struct list_head node; spinlock_t sk_lock; int sk_count; struct hlist_nulls_head sk_laddr_hash[LLC_SK_LADDR_HASH_ENTRIES]; struct hlist_head sk_dev_hash[LLC_SK_DEV_HASH_ENTRIES]; struct rcu_head rcu; }; static inline struct hlist_head *llc_sk_dev_hash(struct llc_sap *sap, int ifindex) { u32 bucket = hash_32(ifindex, LLC_SK_DEV_HASH_BITS); return &sap->sk_dev_hash[bucket]; } static inline u32 llc_sk_laddr_hashfn(struct llc_sap *sap, const struct llc_addr *laddr) { return hash_32(jhash(laddr->mac, sizeof(laddr->mac), 0), LLC_SK_LADDR_HASH_BITS); } static inline struct hlist_nulls_head *llc_sk_laddr_hash(struct llc_sap *sap, const struct llc_addr *laddr) { return &sap->sk_laddr_hash[llc_sk_laddr_hashfn(sap, laddr)]; } #define LLC_DEST_INVALID 0 /* Invalid LLC PDU type */ #define LLC_DEST_SAP 1 /* Type 1 goes here */ #define LLC_DEST_CONN 2 /* Type 2 goes here */ extern struct list_head llc_sap_list; int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); int llc_mac_hdr_init(struct sk_buff *skb, const unsigned char *sa, const unsigned char *da); void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, struct sk_buff *skb)); void llc_remove_pack(int type); void llc_set_station_handler(void (*handler)(struct sk_buff *skb)); struct llc_sap *llc_sap_open(unsigned char lsap, int (*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)); static inline void llc_sap_hold(struct llc_sap *sap) { refcount_inc(&sap->refcnt); } static inline bool llc_sap_hold_safe(struct llc_sap *sap) { return refcount_inc_not_zero(&sap->refcnt); } void llc_sap_close(struct llc_sap *sap); static inline void llc_sap_put(struct llc_sap *sap) { if (refcount_dec_and_test(&sap->refcnt)) llc_sap_close(sap); } struct llc_sap *llc_sap_find(unsigned char sap_value); int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, const unsigned char *dmac, unsigned char dsap); void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb); void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb); void llc_station_init(void); void llc_station_exit(void); #ifdef CONFIG_PROC_FS int llc_proc_init(void); void llc_proc_exit(void); #else #define llc_proc_init() (0) #define llc_proc_exit() do { } while(0) #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSCTL int llc_sysctl_init(void); void llc_sysctl_exit(void); extern int sysctl_llc2_ack_timeout; extern int sysctl_llc2_busy_timeout; extern int sysctl_llc2_p_timeout; extern int sysctl_llc2_rej_timeout; #else #define llc_sysctl_init() (0) #define llc_sysctl_exit() do { } while(0) #endif /* CONFIG_SYSCTL */ #endif /* LLC_H */
8 7 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // SPDX-License-Identifier: GPL-2.0-only /* * Creates audit record for dropped/accepted packets * * (C) 2010-2011 Thomas Graf <tgraf@redhat.com> * (C) 2010-2011 Red Hat, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/audit.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_AUDIT.h> #include <linux/netfilter_bridge/ebtables.h> #include <net/ipv6.h> #include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Thomas Graf <tgraf@redhat.com>"); MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets"); MODULE_ALIAS("ipt_AUDIT"); MODULE_ALIAS("ip6t_AUDIT"); MODULE_ALIAS("ebt_AUDIT"); MODULE_ALIAS("arpt_AUDIT"); static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) { struct iphdr _iph; const struct iphdr *ih; ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph); if (!ih) return false; audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu", &ih->saddr, &ih->daddr, ih->protocol); return true; } static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) { struct ipv6hdr _ip6h; const struct ipv6hdr *ih; u8 nexthdr; __be16 frag_off; ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); if (!ih) return false; nexthdr = ih->nexthdr; ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off); audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", &ih->saddr, &ih->daddr, nexthdr); return true; } static unsigned int audit_tg(struct sk_buff *skb, const struct xt_action_param *par) { struct audit_buffer *ab; int fam = -1; if (audit_enabled == AUDIT_OFF) goto errout; ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); if (ab == NULL) goto errout; audit_log_format(ab, "mark=%#x", skb->mark); switch (xt_family(par)) { case NFPROTO_BRIDGE: switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; break; case htons(ETH_P_IPV6): fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; break; } break; case NFPROTO_IPV4: fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; break; case NFPROTO_IPV6: fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; break; } if (fam == -1) audit_log_format(ab, " saddr=? daddr=? proto=-1"); audit_log_end(ab); errout: return XT_CONTINUE; } static unsigned int audit_tg_ebt(struct sk_buff *skb, const struct xt_action_param *par) { audit_tg(skb, par); return EBT_CONTINUE; } static int audit_tg_check(const struct xt_tgchk_param *par) { const struct xt_audit_info *info = par->targinfo; if (info->type > XT_AUDIT_TYPE_MAX) { pr_info_ratelimited("Audit type out of range (valid range: 0..%u)\n", XT_AUDIT_TYPE_MAX); return -ERANGE; } return 0; } static struct xt_target audit_tg_reg[] __read_mostly = { { .name = "AUDIT", .family = NFPROTO_UNSPEC, .target = audit_tg, .targetsize = sizeof(struct xt_audit_info), .checkentry = audit_tg_check, .me = THIS_MODULE, }, { .name = "AUDIT", .family = NFPROTO_BRIDGE, .target = audit_tg_ebt, .targetsize = sizeof(struct xt_audit_info), .checkentry = audit_tg_check, .me = THIS_MODULE, }, }; static int __init audit_tg_init(void) { return xt_register_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg)); } static void __exit audit_tg_exit(void) { xt_unregister_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg)); } module_init(audit_tg_init); module_exit(audit_tg_exit);
18 18 18 25 26 26 18 18 18 18 18 18 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" /** * struct devlink_resource - devlink resource * @name: name of the resource * @id: id, per devlink instance * @size: size of the resource * @size_new: updated size of the resource, reload is needed * @size_valid: valid in case the total size of the resource is valid * including its children * @parent: parent resource * @size_params: size parameters * @list: parent list * @resource_list: list of child resources * @occ_get: occupancy getter callback * @occ_get_priv: occupancy getter callback priv */ struct devlink_resource { const char *name; u64 id; u64 size; u64 size_new; bool size_valid; struct devlink_resource *parent; struct devlink_resource_size_params size_params; struct list_head list; struct list_head resource_list; devlink_resource_occ_get_t *occ_get; void *occ_get_priv; }; static struct devlink_resource * devlink_resource_find(struct devlink *devlink, struct devlink_resource *resource, u64 resource_id) { struct list_head *resource_list; if (resource) resource_list = &resource->resource_list; else resource_list = &devlink->resource_list; list_for_each_entry(resource, resource_list, list) { struct devlink_resource *child_resource; if (resource->id == resource_id) return resource; child_resource = devlink_resource_find(devlink, resource, resource_id); if (child_resource) return child_resource; } return NULL; } static void devlink_resource_validate_children(struct devlink_resource *resource) { struct devlink_resource *child_resource; bool size_valid = true; u64 parts_size = 0; if (list_empty(&resource->resource_list)) goto out; list_for_each_entry(child_resource, &resource->resource_list, list) parts_size += child_resource->size_new; if (parts_size > resource->size_new) size_valid = false; out: resource->size_valid = size_valid; } static int devlink_resource_validate_size(struct devlink_resource *resource, u64 size, struct netlink_ext_ack *extack) { u64 reminder; int err = 0; if (size > resource->size_params.size_max) { NL_SET_ERR_MSG(extack, "Size larger than maximum"); err = -EINVAL; } if (size < resource->size_params.size_min) { NL_SET_ERR_MSG(extack, "Size smaller than minimum"); err = -EINVAL; } div64_u64_rem(size, resource->size_params.size_granularity, &reminder); if (reminder) { NL_SET_ERR_MSG(extack, "Wrong granularity"); err = -EINVAL; } return err; } int devlink_nl_resource_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_resource *resource; u64 resource_id; u64 size; int err; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_ID) || GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_SIZE)) return -EINVAL; resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]); resource = devlink_resource_find(devlink, NULL, resource_id); if (!resource) return -EINVAL; size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]); err = devlink_resource_validate_size(resource, size, info->extack); if (err) return err; resource->size_new = size; devlink_resource_validate_children(resource); if (resource->parent) devlink_resource_validate_children(resource->parent); return 0; } static int devlink_resource_size_params_put(struct devlink_resource *resource, struct sk_buff *skb) { struct devlink_resource_size_params *size_params; size_params = &resource->size_params; if (devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, size_params->size_granularity) || devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, size_params->size_max) || devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, size_params->size_min) || nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit)) return -EMSGSIZE; return 0; } static int devlink_resource_occ_put(struct devlink_resource *resource, struct sk_buff *skb) { if (!resource->occ_get) return 0; return devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_OCC, resource->occ_get(resource->occ_get_priv)); } static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, struct devlink_resource *resource) { struct devlink_resource *child_resource; struct nlattr *child_resource_attr; struct nlattr *resource_attr; resource_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE); if (!resource_attr) return -EMSGSIZE; if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) || devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size) || devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id)) goto nla_put_failure; if (resource->size != resource->size_new && devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, resource->size_new)) goto nla_put_failure; if (devlink_resource_occ_put(resource, skb)) goto nla_put_failure; if (devlink_resource_size_params_put(resource, skb)) goto nla_put_failure; if (list_empty(&resource->resource_list)) goto out; if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID, resource->size_valid)) goto nla_put_failure; child_resource_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE_LIST); if (!child_resource_attr) goto nla_put_failure; list_for_each_entry(child_resource, &resource->resource_list, list) { if (devlink_resource_put(devlink, skb, child_resource)) goto resource_put_failure; } nla_nest_end(skb, child_resource_attr); out: nla_nest_end(skb, resource_attr); return 0; resource_put_failure: nla_nest_cancel(skb, child_resource_attr); nla_put_failure: nla_nest_cancel(skb, resource_attr); return -EMSGSIZE; } static int devlink_resource_fill(struct genl_info *info, enum devlink_command cmd, int flags) { struct devlink *devlink = info->user_ptr[0]; struct devlink_resource *resource; struct nlattr *resources_attr; struct sk_buff *skb = NULL; struct nlmsghdr *nlh; bool incomplete; void *hdr; int i; int err; resource = list_first_entry(&devlink->resource_list, struct devlink_resource, list); start_again: err = devlink_nl_msg_reply_and_new(&skb, info); if (err) return err; hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &devlink_nl_family, NLM_F_MULTI, cmd); if (!hdr) { nlmsg_free(skb); return -EMSGSIZE; } if (devlink_nl_put_handle(skb, devlink)) goto nla_put_failure; resources_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE_LIST); if (!resources_attr) goto nla_put_failure; incomplete = false; i = 0; list_for_each_entry_from(resource, &devlink->resource_list, list) { err = devlink_resource_put(devlink, skb, resource); if (err) { if (!i) goto err_resource_put; incomplete = true; break; } i++; } nla_nest_end(skb, resources_attr); genlmsg_end(skb, hdr); if (incomplete) goto start_again; send_done: nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { err = devlink_nl_msg_reply_and_new(&skb, info); if (err) return err; goto send_done; } return genlmsg_reply(skb, info); nla_put_failure: err = -EMSGSIZE; err_resource_put: nlmsg_free(skb); return err; } int devlink_nl_resource_dump_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; if (list_empty(&devlink->resource_list)) return -EOPNOTSUPP; return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); } int devlink_resources_validate(struct devlink *devlink, struct devlink_resource *resource, struct genl_info *info) { struct list_head *resource_list; int err = 0; if (resource) resource_list = &resource->resource_list; else resource_list = &devlink->resource_list; list_for_each_entry(resource, resource_list, list) { if (!resource->size_valid) return -EINVAL; err = devlink_resources_validate(devlink, resource, info); if (err) return err; } return err; } /** * devl_resource_register - devlink resource register * * @devlink: devlink * @resource_name: resource's name * @resource_size: resource's size * @resource_id: resource's id * @parent_resource_id: resource's parent id * @size_params: size parameters * * Generic resources should reuse the same names across drivers. * Please see the generic resources list at: * Documentation/networking/devlink/devlink-resource.rst */ int devl_resource_register(struct devlink *devlink, const char *resource_name, u64 resource_size, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params) { struct devlink_resource *resource; struct list_head *resource_list; bool top_hierarchy; lockdep_assert_held(&devlink->lock); top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP; resource = devlink_resource_find(devlink, NULL, resource_id); if (resource) return -EEXIST; resource = kzalloc(sizeof(*resource), GFP_KERNEL); if (!resource) return -ENOMEM; if (top_hierarchy) { resource_list = &devlink->resource_list; } else { struct devlink_resource *parent_resource; parent_resource = devlink_resource_find(devlink, NULL, parent_resource_id); if (parent_resource) { resource_list = &parent_resource->resource_list; resource->parent = parent_resource; } else { kfree(resource); return -EINVAL; } } resource->name = resource_name; resource->size = resource_size; resource->size_new = resource_size; resource->id = resource_id; resource->size_valid = true; memcpy(&resource->size_params, size_params, sizeof(resource->size_params)); INIT_LIST_HEAD(&resource->resource_list); list_add_tail(&resource->list, resource_list); return 0; } EXPORT_SYMBOL_GPL(devl_resource_register); static void devlink_resource_unregister(struct devlink *devlink, struct devlink_resource *resource) { struct devlink_resource *tmp, *child_resource; list_for_each_entry_safe(child_resource, tmp, &resource->resource_list, list) { devlink_resource_unregister(devlink, child_resource); list_del(&child_resource->list); kfree(child_resource); } } /** * devl_resources_unregister - free all resources * * @devlink: devlink */ void devl_resources_unregister(struct devlink *devlink) { struct devlink_resource *tmp, *child_resource; lockdep_assert_held(&devlink->lock); list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list, list) { devlink_resource_unregister(devlink, child_resource); list_del(&child_resource->list); kfree(child_resource); } } EXPORT_SYMBOL_GPL(devl_resources_unregister); /** * devlink_resources_unregister - free all resources * * @devlink: devlink * * Context: Takes and release devlink->lock <mutex>. */ void devlink_resources_unregister(struct devlink *devlink) { devl_lock(devlink); devl_resources_unregister(devlink); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_resources_unregister); /** * devl_resource_size_get - get and update size * * @devlink: devlink * @resource_id: the requested resource id * @p_resource_size: ptr to update */ int devl_resource_size_get(struct devlink *devlink, u64 resource_id, u64 *p_resource_size) { struct devlink_resource *resource; lockdep_assert_held(&devlink->lock); resource = devlink_resource_find(devlink, NULL, resource_id); if (!resource) return -EINVAL; *p_resource_size = resource->size_new; resource->size = resource->size_new; return 0; } EXPORT_SYMBOL_GPL(devl_resource_size_get); /** * devl_resource_occ_get_register - register occupancy getter * * @devlink: devlink * @resource_id: resource id * @occ_get: occupancy getter callback * @occ_get_priv: occupancy getter callback priv */ void devl_resource_occ_get_register(struct devlink *devlink, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv) { struct devlink_resource *resource; lockdep_assert_held(&devlink->lock); resource = devlink_resource_find(devlink, NULL, resource_id); if (WARN_ON(!resource)) return; WARN_ON(resource->occ_get); resource->occ_get = occ_get; resource->occ_get_priv = occ_get_priv; } EXPORT_SYMBOL_GPL(devl_resource_occ_get_register); /** * devl_resource_occ_get_unregister - unregister occupancy getter * * @devlink: devlink * @resource_id: resource id */ void devl_resource_occ_get_unregister(struct devlink *devlink, u64 resource_id) { struct devlink_resource *resource; lockdep_assert_held(&devlink->lock); resource = devlink_resource_find(devlink, NULL, resource_id); if (WARN_ON(!resource)) return; WARN_ON(!resource->occ_get); resource->occ_get = NULL; resource->occ_get_priv = NULL; } EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister);
4519 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 // SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/bug.h> #include <linux/atomic.h> #include <linux/errseq.h> #include <linux/log2.h> /* * An errseq_t is a way of recording errors in one place, and allowing any * number of "subscribers" to tell whether it has changed since a previous * point where it was sampled. * * It's implemented as an unsigned 32-bit value. The low order bits are * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits * are used as a counter. This is done with atomics instead of locking so that * these functions can be called from any context. * * The general idea is for consumers to sample an errseq_t value. That value * can later be used to tell whether any new errors have occurred since that * sampling was done. * * Note that there is a risk of collisions if new errors are being recorded * frequently, since we have so few bits to use as a counter. * * To mitigate this, one bit is used as a flag to tell whether the value has * been sampled since a new value was recorded. That allows us to avoid bumping * the counter if no one has sampled it since the last time an error was * recorded. * * A new errseq_t should always be zeroed out. A errseq_t value of all zeroes * is the special (but common) case where there has never been an error. An all * zero value thus serves as the "epoch" if one wishes to know whether there * has ever been an error set since it was first initialized. */ /* The low bits are designated for error code (max of MAX_ERRNO) */ #define ERRSEQ_SHIFT ilog2(MAX_ERRNO + 1) /* This bit is used as a flag to indicate whether the value has been seen */ #define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT) /* The lowest bit of the counter */ #define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) /** * errseq_set - set a errseq_t for later reporting * @eseq: errseq_t field that should be set * @err: error to set (must be between -1 and -MAX_ERRNO) * * This function sets the error in @eseq, and increments the sequence counter * if the last sequence was sampled at some point in the past. * * Any error set will always overwrite an existing error. * * Return: The previous value, primarily for debugging purposes. The * return value should not be used as a previously sampled value in later * calls as it will not have the SEEN flag set. */ errseq_t errseq_set(errseq_t *eseq, int err) { errseq_t cur, old; /* MAX_ERRNO must be able to serve as a mask */ BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1); /* * Ensure the error code actually fits where we want it to go. If it * doesn't then just throw a warning and don't record anything. We * also don't accept zero here as that would effectively clear a * previous error. */ old = READ_ONCE(*eseq); if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO), "err = %d\n", err)) return old; for (;;) { errseq_t new; /* Clear out error bits and set new error */ new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err; /* Only increment if someone has looked at it */ if (old & ERRSEQ_SEEN) new += ERRSEQ_CTR_INC; /* If there would be no change, then call it done */ if (new == old) { cur = new; break; } /* Try to swap the new value into place */ cur = cmpxchg(eseq, old, new); /* * Call it success if we did the swap or someone else beat us * to it for the same value. */ if (likely(cur == old || cur == new)) break; /* Raced with an update, try again */ old = cur; } return cur; } EXPORT_SYMBOL(errseq_set); /** * errseq_sample() - Grab current errseq_t value. * @eseq: Pointer to errseq_t to be sampled. * * This function allows callers to initialise their errseq_t variable. * If the error has been "seen", new callers will not see an old error. * If there is an unseen error in @eseq, the caller of this function will * see it the next time it checks for an error. * * Context: Any context. * Return: The current errseq value. */ errseq_t errseq_sample(errseq_t *eseq) { errseq_t old = READ_ONCE(*eseq); /* If nobody has seen this error yet, then we can be the first. */ if (!(old & ERRSEQ_SEEN)) old = 0; return old; } EXPORT_SYMBOL(errseq_sample); /** * errseq_check() - Has an error occurred since a particular sample point? * @eseq: Pointer to errseq_t value to be checked. * @since: Previously-sampled errseq_t from which to check. * * Grab the value that eseq points to, and see if it has changed @since * the given value was sampled. The @since value is not advanced, so there * is no need to mark the value as seen. * * Return: The latest error set in the errseq_t or 0 if it hasn't changed. */ int errseq_check(errseq_t *eseq, errseq_t since) { errseq_t cur = READ_ONCE(*eseq); if (likely(cur == since)) return 0; return -(cur & MAX_ERRNO); } EXPORT_SYMBOL(errseq_check); /** * errseq_check_and_advance() - Check an errseq_t and advance to current value. * @eseq: Pointer to value being checked and reported. * @since: Pointer to previously-sampled errseq_t to check against and advance. * * Grab the eseq value, and see whether it matches the value that @since * points to. If it does, then just return 0. * * If it doesn't, then the value has changed. Set the "seen" flag, and try to * swap it into place as the new eseq value. Then, set that value as the new * "since" value, and return whatever the error portion is set to. * * Note that no locking is provided here for concurrent updates to the "since" * value. The caller must provide that if necessary. Because of this, callers * may want to do a lockless errseq_check before taking the lock and calling * this. * * Return: Negative errno if one has been stored, or 0 if no new error has * occurred. */ int errseq_check_and_advance(errseq_t *eseq, errseq_t *since) { int err = 0; errseq_t old, new; /* * Most callers will want to use the inline wrapper to check this, * so that the common case of no error is handled without needing * to take the lock that protects the "since" value. */ old = READ_ONCE(*eseq); if (old != *since) { /* * Set the flag and try to swap it into place if it has * changed. * * We don't care about the outcome of the swap here. If the * swap doesn't occur, then it has either been updated by a * writer who is altering the value in some way (updating * counter or resetting the error), or another reader who is * just setting the "seen" flag. Either outcome is OK, and we * can advance "since" and return an error based on what we * have. */ new = old | ERRSEQ_SEEN; if (new != old) cmpxchg(eseq, old, new); *since = new; err = -(new & MAX_ERRNO); } return err; } EXPORT_SYMBOL(errseq_check_and_advance);
3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 // SPDX-License-Identifier: GPL-2.0-only /* * iptables module for DCCP protocol header matching * * (C) 2005 by Harald Welte <laforge@netfilter.org> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/ip.h> #include <linux/dccp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_dccp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: DCCP protocol packet match"); MODULE_ALIAS("ipt_dccp"); MODULE_ALIAS("ip6t_dccp"); #define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ || (!!((invflag) & (option)) ^ (cond))) static unsigned char *dccp_optbuf; static DEFINE_SPINLOCK(dccp_buflock); static inline bool dccp_find_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, const struct dccp_hdr *dh, bool *hotdrop) { /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ const unsigned char *op; unsigned int optoff = __dccp_hdr_len(dh); unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh); unsigned int i; if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) goto invalid; if (!optlen) return false; spin_lock_bh(&dccp_buflock); op = skb_header_pointer(skb, protoff + optoff, optlen, dccp_optbuf); if (op == NULL) { /* If we don't have the whole header, drop packet. */ goto partial; } for (i = 0; i < optlen; ) { if (op[i] == option) { spin_unlock_bh(&dccp_buflock); return true; } if (op[i] < 2) i++; else i += op[i+1]?:1; } spin_unlock_bh(&dccp_buflock); return false; partial: spin_unlock_bh(&dccp_buflock); invalid: *hotdrop = true; return false; } static inline bool match_types(const struct dccp_hdr *dh, u_int16_t typemask) { return typemask & (1 << dh->dccph_type); } static inline bool match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, const struct dccp_hdr *dh, bool *hotdrop) { return dccp_find_option(option, skb, protoff, dh, hotdrop); } static bool dccp_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_dccp_info *info = par->matchinfo; const struct dccp_hdr *dh; struct dccp_hdr _dh; if (par->fragoff != 0) return false; dh = skb_header_pointer(skb, par->thoff, sizeof(_dh), &_dh); if (dh == NULL) { par->hotdrop = true; return false; } return DCCHECK(ntohs(dh->dccph_sport) >= info->spts[0] && ntohs(dh->dccph_sport) <= info->spts[1], XT_DCCP_SRC_PORTS, info->flags, info->invflags) && DCCHECK(ntohs(dh->dccph_dport) >= info->dpts[0] && ntohs(dh->dccph_dport) <= info->dpts[1], XT_DCCP_DEST_PORTS, info->flags, info->invflags) && DCCHECK(match_types(dh, info->typemask), XT_DCCP_TYPE, info->flags, info->invflags) && DCCHECK(match_option(info->option, skb, par->thoff, dh, &par->hotdrop), XT_DCCP_OPTION, info->flags, info->invflags); } static int dccp_mt_check(const struct xt_mtchk_param *par) { const struct xt_dccp_info *info = par->matchinfo; if (info->flags & ~XT_DCCP_VALID_FLAGS) return -EINVAL; if (info->invflags & ~XT_DCCP_VALID_FLAGS) return -EINVAL; if (info->invflags & ~info->flags) return -EINVAL; return 0; } static struct xt_match dccp_mt_reg[] __read_mostly = { { .name = "dccp", .family = NFPROTO_IPV4, .checkentry = dccp_mt_check, .match = dccp_mt, .matchsize = sizeof(struct xt_dccp_info), .proto = IPPROTO_DCCP, .me = THIS_MODULE, }, { .name = "dccp", .family = NFPROTO_IPV6, .checkentry = dccp_mt_check, .match = dccp_mt, .matchsize = sizeof(struct xt_dccp_info), .proto = IPPROTO_DCCP, .me = THIS_MODULE, }, }; static int __init dccp_mt_init(void) { int ret; /* doff is 8 bits, so the maximum option size is (4*256). Don't put * this in BSS since DaveM is worried about locked TLB's for kernel * BSS. */ dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL); if (!dccp_optbuf) return -ENOMEM; ret = xt_register_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg)); if (ret) goto out_kfree; return ret; out_kfree: kfree(dccp_optbuf); return ret; } static void __exit dccp_mt_exit(void) { xt_unregister_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg)); kfree(dccp_optbuf); } module_init(dccp_mt_init); module_exit(dccp_mt_exit);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 /* SPDX-License-Identifier: GPL-2.0-only */ /* * linux/fs/pnode.h * * (C) Copyright IBM Corporation 2005. */ #ifndef _LINUX_PNODE_H #define _LINUX_PNODE_H #include <linux/list.h> #include "mount.h" #define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED) #define IS_MNT_SLAVE(m) ((m)->mnt_master) #define IS_MNT_PROPAGATED(m) (!(m)->mnt_ns || ((m)->mnt_ns->mntns_flags & MNTNS_PROPAGATING)) #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) #define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 #define CL_COPY_UNBINDABLE 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 #define CL_COPY_MNT_NS_FILE 0x40 #define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) static inline void set_mnt_shared(struct mount *mnt) { mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK; mnt->mnt.mnt_flags |= MNT_SHARED; } void change_mnt_propagation(struct mount *, int); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct hlist_head *); int propagate_umount(struct list_head *); int propagate_mount_busy(struct mount *, int); void propagate_mount_unlock(struct mount *); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); int count_mounts(struct mnt_namespace *ns, struct mount *mnt); bool propagation_would_overmount(const struct mount *from, const struct mount *to, const struct mountpoint *mp); #endif /* _LINUX_PNODE_H */
3021 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Global definitions for the Ethernet IEEE 802.3 interface. * * Version: @(#)if_ether.h 1.0.1a 02/08/94 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Steve Whitehouse, <gw7rrm@eeshack3.swan.ac.uk> */ #ifndef _LINUX_IF_ETHER_H #define _LINUX_IF_ETHER_H #include <linux/skbuff.h> #include <uapi/linux/if_ether.h> /* XX:XX:XX:XX:XX:XX */ #define MAC_ADDR_STR_LEN (3 * ETH_ALEN - 1) static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb_mac_header(skb); } /* Prefer this version in TX path, instead of * skb_reset_mac_header() + eth_hdr() */ static inline struct ethhdr *skb_eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb->data; } static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb_inner_mac_header(skb); } int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len); #endif /* _LINUX_IF_ETHER_H */
12 4 3 3 2 5 2 5 3 2 5 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_redirect.h> #include <net/netfilter/nf_tables.h> struct nft_redir { u8 sreg_proto_min; u8 sreg_proto_max; u16 flags; }; static const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = { [NFTA_REDIR_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_REDIR_REG_PROTO_MAX] = { .type = NLA_U32 }, [NFTA_REDIR_FLAGS] = NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_redir_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { int err; err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); if (err < 0) return err; return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)); } static int nft_redir_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_redir *priv = nft_expr_priv(expr); unsigned int plen; int err; plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_REDIR_REG_PROTO_MIN]) { err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MIN], &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_REDIR_REG_PROTO_MAX]) { err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MAX], &priv->sreg_proto_max, plen); if (err < 0) return err; } else { priv->sreg_proto_max = priv->sreg_proto_min; } priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[NFTA_REDIR_FLAGS]) priv->flags = ntohl(nla_get_be32(tb[NFTA_REDIR_FLAGS])); return nf_ct_netns_get(ctx->net, ctx->family); } static int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_redir *priv = nft_expr_priv(expr); if (priv->sreg_proto_min) { if (nft_dump_register(skb, NFTA_REDIR_REG_PROTO_MIN, priv->sreg_proto_min)) goto nla_put_failure; if (nft_dump_register(skb, NFTA_REDIR_REG_PROTO_MAX, priv->sreg_proto_max)) goto nla_put_failure; } if (priv->flags != 0 && nla_put_be32(skb, NFTA_REDIR_FLAGS, htonl(priv->flags))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_redir_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_redir *priv = nft_expr_priv(expr); struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); range.flags = priv->flags; if (priv->sreg_proto_min) { range.min_proto.all = (__force __be16) nft_reg_load16(&regs->data[priv->sreg_proto_min]); range.max_proto.all = (__force __be16) nft_reg_load16(&regs->data[priv->sreg_proto_max]); } switch (nft_pf(pkt)) { case NFPROTO_IPV4: regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &range, nft_hook(pkt)); break; #ifdef CONFIG_NF_TABLES_IPV6 case NFPROTO_IPV6: regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt)); break; #endif default: WARN_ON_ONCE(1); break; } } static void nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, NFPROTO_IPV4); } static struct nft_expr_type nft_redir_ipv4_type; static const struct nft_expr_ops nft_redir_ipv4_ops = { .type = &nft_redir_ipv4_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), .eval = nft_redir_eval, .init = nft_redir_init, .destroy = nft_redir_ipv4_destroy, .dump = nft_redir_dump, .validate = nft_redir_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_redir_ipv4_type __read_mostly = { .family = NFPROTO_IPV4, .name = "redir", .ops = &nft_redir_ipv4_ops, .policy = nft_redir_policy, .maxattr = NFTA_REDIR_MAX, .owner = THIS_MODULE, }; #ifdef CONFIG_NF_TABLES_IPV6 static void nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, NFPROTO_IPV6); } static struct nft_expr_type nft_redir_ipv6_type; static const struct nft_expr_ops nft_redir_ipv6_ops = { .type = &nft_redir_ipv6_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), .eval = nft_redir_eval, .init = nft_redir_init, .destroy = nft_redir_ipv6_destroy, .dump = nft_redir_dump, .validate = nft_redir_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_redir_ipv6_type __read_mostly = { .family = NFPROTO_IPV6, .name = "redir", .ops = &nft_redir_ipv6_ops, .policy = nft_redir_policy, .maxattr = NFTA_REDIR_MAX, .owner = THIS_MODULE, }; #endif #ifdef CONFIG_NF_TABLES_INET static void nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, NFPROTO_INET); } static struct nft_expr_type nft_redir_inet_type; static const struct nft_expr_ops nft_redir_inet_ops = { .type = &nft_redir_inet_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), .eval = nft_redir_eval, .init = nft_redir_init, .destroy = nft_redir_inet_destroy, .dump = nft_redir_dump, .validate = nft_redir_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_redir_inet_type __read_mostly = { .family = NFPROTO_INET, .name = "redir", .ops = &nft_redir_inet_ops, .policy = nft_redir_policy, .maxattr = NFTA_REDIR_MAX, .owner = THIS_MODULE, }; static int __init nft_redir_module_init_inet(void) { return nft_register_expr(&nft_redir_inet_type); } #else static inline int nft_redir_module_init_inet(void) { return 0; } #endif static int __init nft_redir_module_init(void) { int ret = nft_register_expr(&nft_redir_ipv4_type); if (ret) return ret; #ifdef CONFIG_NF_TABLES_IPV6 ret = nft_register_expr(&nft_redir_ipv6_type); if (ret) { nft_unregister_expr(&nft_redir_ipv4_type); return ret; } #endif ret = nft_redir_module_init_inet(); if (ret < 0) { nft_unregister_expr(&nft_redir_ipv4_type); #ifdef CONFIG_NF_TABLES_IPV6 nft_unregister_expr(&nft_redir_ipv6_type); #endif return ret; } return ret; } static void __exit nft_redir_module_exit(void) { nft_unregister_expr(&nft_redir_ipv4_type); #ifdef CONFIG_NF_TABLES_IPV6 nft_unregister_expr(&nft_redir_ipv6_type); #endif #ifdef CONFIG_NF_TABLES_INET nft_unregister_expr(&nft_redir_inet_type); #endif } module_init(nft_redir_module_init); module_exit(nft_redir_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>"); MODULE_ALIAS_NFT_EXPR("redir"); MODULE_DESCRIPTION("Netfilter nftables redirect support");
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* * Copyright (c) 2015, Sony Mobile Communications Inc. * Copyright (c) 2013, The Linux Foundation. All rights reserved. * Copyright (c) 2020, Linaro Ltd. */ #include <linux/module.h> #include <linux/qrtr.h> #include <linux/workqueue.h> #include <net/sock.h> #include "qrtr.h" #include <trace/events/sock.h> #define CREATE_TRACE_POINTS #include <trace/events/qrtr.h> static DEFINE_XARRAY(nodes); static struct { struct socket *sock; struct sockaddr_qrtr bcast_sq; struct list_head lookups; struct workqueue_struct *workqueue; struct work_struct work; int local_node; } qrtr_ns; static const char * const qrtr_ctrl_pkt_strings[] = { [QRTR_TYPE_HELLO] = "hello", [QRTR_TYPE_BYE] = "bye", [QRTR_TYPE_NEW_SERVER] = "new-server", [QRTR_TYPE_DEL_SERVER] = "del-server", [QRTR_TYPE_DEL_CLIENT] = "del-client", [QRTR_TYPE_RESUME_TX] = "resume-tx", [QRTR_TYPE_EXIT] = "exit", [QRTR_TYPE_PING] = "ping", [QRTR_TYPE_NEW_LOOKUP] = "new-lookup", [QRTR_TYPE_DEL_LOOKUP] = "del-lookup", }; struct qrtr_server_filter { unsigned int service; unsigned int instance; unsigned int ifilter; }; struct qrtr_lookup { unsigned int service; unsigned int instance; struct sockaddr_qrtr sq; struct list_head li; }; struct qrtr_server { unsigned int service; unsigned int instance; unsigned int node; unsigned int port; struct list_head qli; }; struct qrtr_node { unsigned int id; struct xarray servers; }; static struct qrtr_node *node_get(unsigned int node_id) { struct qrtr_node *node; node = xa_load(&nodes, node_id); if (node) return node; /* If node didn't exist, allocate and insert it to the tree */ node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return NULL; node->id = node_id; xa_init(&node->servers); if (xa_store(&nodes, node_id, node, GFP_KERNEL)) { kfree(node); return NULL; } return node; } static int server_match(const struct qrtr_server *srv, const struct qrtr_server_filter *f) { unsigned int ifilter = f->ifilter; if (f->service != 0 && srv->service != f->service) return 0; if (!ifilter && f->instance) ifilter = ~0; return (srv->instance & ifilter) == f->instance; } static int service_announce_new(struct sockaddr_qrtr *dest, struct qrtr_server *srv) { struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; struct kvec iv; trace_qrtr_ns_service_announce_new(srv->service, srv->instance, srv->node, srv->port); iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); memset(&pkt, 0, sizeof(pkt)); pkt.cmd = cpu_to_le32(QRTR_TYPE_NEW_SERVER); pkt.server.service = cpu_to_le32(srv->service); pkt.server.instance = cpu_to_le32(srv->instance); pkt.server.node = cpu_to_le32(srv->node); pkt.server.port = cpu_to_le32(srv->port); msg.msg_name = (struct sockaddr *)dest; msg.msg_namelen = sizeof(*dest); return kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); } static void service_announce_del(struct sockaddr_qrtr *dest, struct qrtr_server *srv) { struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; struct kvec iv; int ret; trace_qrtr_ns_service_announce_del(srv->service, srv->instance, srv->node, srv->port); iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); memset(&pkt, 0, sizeof(pkt)); pkt.cmd = cpu_to_le32(QRTR_TYPE_DEL_SERVER); pkt.server.service = cpu_to_le32(srv->service); pkt.server.instance = cpu_to_le32(srv->instance); pkt.server.node = cpu_to_le32(srv->node); pkt.server.port = cpu_to_le32(srv->port); msg.msg_name = (struct sockaddr *)dest; msg.msg_namelen = sizeof(*dest); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); if (ret < 0 && ret != -ENODEV) pr_err("failed to announce del service\n"); return; } static void lookup_notify(struct sockaddr_qrtr *to, struct qrtr_server *srv, bool new) { struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; struct kvec iv; int ret; iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); memset(&pkt, 0, sizeof(pkt)); pkt.cmd = new ? cpu_to_le32(QRTR_TYPE_NEW_SERVER) : cpu_to_le32(QRTR_TYPE_DEL_SERVER); if (srv) { pkt.server.service = cpu_to_le32(srv->service); pkt.server.instance = cpu_to_le32(srv->instance); pkt.server.node = cpu_to_le32(srv->node); pkt.server.port = cpu_to_le32(srv->port); } msg.msg_name = (struct sockaddr *)to; msg.msg_namelen = sizeof(*to); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); if (ret < 0 && ret != -ENODEV) pr_err("failed to send lookup notification\n"); } static int announce_servers(struct sockaddr_qrtr *sq) { struct qrtr_server *srv; struct qrtr_node *node; unsigned long index; int ret; node = node_get(qrtr_ns.local_node); if (!node) return 0; /* Announce the list of servers registered in this node */ xa_for_each(&node->servers, index, srv) { ret = service_announce_new(sq, srv); if (ret < 0) { if (ret == -ENODEV) continue; pr_err("failed to announce new service\n"); return ret; } } return 0; } static struct qrtr_server *server_add(unsigned int service, unsigned int instance, unsigned int node_id, unsigned int port) { struct qrtr_server *srv; struct qrtr_server *old; struct qrtr_node *node; if (!service || !port) return NULL; srv = kzalloc(sizeof(*srv), GFP_KERNEL); if (!srv) return NULL; srv->service = service; srv->instance = instance; srv->node = node_id; srv->port = port; node = node_get(node_id); if (!node) goto err; /* Delete the old server on the same port */ old = xa_store(&node->servers, port, srv, GFP_KERNEL); if (old) { if (xa_is_err(old)) { pr_err("failed to add server [0x%x:0x%x] ret:%d\n", srv->service, srv->instance, xa_err(old)); goto err; } else { kfree(old); } } trace_qrtr_ns_server_add(srv->service, srv->instance, srv->node, srv->port); return srv; err: kfree(srv); return NULL; } static int server_del(struct qrtr_node *node, unsigned int port, bool bcast) { struct qrtr_lookup *lookup; struct qrtr_server *srv; struct list_head *li; srv = xa_load(&node->servers, port); if (!srv) return -ENOENT; xa_erase(&node->servers, port); /* Broadcast the removal of local servers */ if (srv->node == qrtr_ns.local_node && bcast) service_announce_del(&qrtr_ns.bcast_sq, srv); /* Announce the service's disappearance to observers */ list_for_each(li, &qrtr_ns.lookups) { lookup = container_of(li, struct qrtr_lookup, li); if (lookup->service && lookup->service != srv->service) continue; if (lookup->instance && lookup->instance != srv->instance) continue; lookup_notify(&lookup->sq, srv, false); } kfree(srv); return 0; } static int say_hello(struct sockaddr_qrtr *dest) { struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; struct kvec iv; int ret; iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); memset(&pkt, 0, sizeof(pkt)); pkt.cmd = cpu_to_le32(QRTR_TYPE_HELLO); msg.msg_name = (struct sockaddr *)dest; msg.msg_namelen = sizeof(*dest); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); if (ret < 0) pr_err("failed to send hello msg\n"); return ret; } /* Announce the list of servers registered on the local node */ static int ctrl_cmd_hello(struct sockaddr_qrtr *sq) { int ret; ret = say_hello(sq); if (ret < 0) return ret; return announce_servers(sq); } static int ctrl_cmd_bye(struct sockaddr_qrtr *from) { struct qrtr_node *local_node; struct qrtr_ctrl_pkt pkt; struct qrtr_server *srv; struct sockaddr_qrtr sq; struct msghdr msg = { }; struct qrtr_node *node; unsigned long index; struct kvec iv; int ret; iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); node = node_get(from->sq_node); if (!node) return 0; /* Advertise removal of this client to all servers of remote node */ xa_for_each(&node->servers, index, srv) server_del(node, srv->port, true); /* Advertise the removal of this client to all local servers */ local_node = node_get(qrtr_ns.local_node); if (!local_node) return 0; memset(&pkt, 0, sizeof(pkt)); pkt.cmd = cpu_to_le32(QRTR_TYPE_BYE); pkt.client.node = cpu_to_le32(from->sq_node); xa_for_each(&local_node->servers, index, srv) { sq.sq_family = AF_QIPCRTR; sq.sq_node = srv->node; sq.sq_port = srv->port; msg.msg_name = (struct sockaddr *)&sq; msg.msg_namelen = sizeof(sq); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); if (ret < 0 && ret != -ENODEV) { pr_err("failed to send bye cmd\n"); return ret; } } return 0; } static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, unsigned int node_id, unsigned int port) { struct qrtr_node *local_node; struct qrtr_lookup *lookup; struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; struct qrtr_server *srv; struct sockaddr_qrtr sq; struct qrtr_node *node; struct list_head *tmp; struct list_head *li; unsigned long index; struct kvec iv; int ret; iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); /* Don't accept spoofed messages */ if (from->sq_node != node_id) return -EINVAL; /* Local DEL_CLIENT messages comes from the port being closed */ if (from->sq_node == qrtr_ns.local_node && from->sq_port != port) return -EINVAL; /* Remove any lookups by this client */ list_for_each_safe(li, tmp, &qrtr_ns.lookups) { lookup = container_of(li, struct qrtr_lookup, li); if (lookup->sq.sq_node != node_id) continue; if (lookup->sq.sq_port != port) continue; list_del(&lookup->li); kfree(lookup); } /* Remove the server belonging to this port but don't broadcast * DEL_SERVER. Neighbours would've already removed the server belonging * to this port due to the DEL_CLIENT broadcast from qrtr_port_remove(). */ node = node_get(node_id); if (node) server_del(node, port, false); /* Advertise the removal of this client to all local servers */ local_node = node_get(qrtr_ns.local_node); if (!local_node) return 0; memset(&pkt, 0, sizeof(pkt)); pkt.cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); pkt.client.node = cpu_to_le32(node_id); pkt.client.port = cpu_to_le32(port); xa_for_each(&local_node->servers, index, srv) { sq.sq_family = AF_QIPCRTR; sq.sq_node = srv->node; sq.sq_port = srv->port; msg.msg_name = (struct sockaddr *)&sq; msg.msg_namelen = sizeof(sq); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); if (ret < 0 && ret != -ENODEV) { pr_err("failed to send del client cmd\n"); return ret; } } return 0; } static int ctrl_cmd_new_server(struct sockaddr_qrtr *from, unsigned int service, unsigned int instance, unsigned int node_id, unsigned int port) { struct qrtr_lookup *lookup; struct qrtr_server *srv; struct list_head *li; int ret = 0; /* Ignore specified node and port for local servers */ if (from->sq_node == qrtr_ns.local_node) { node_id = from->sq_node; port = from->sq_port; } srv = server_add(service, instance, node_id, port); if (!srv) return -EINVAL; if (srv->node == qrtr_ns.local_node) { ret = service_announce_new(&qrtr_ns.bcast_sq, srv); if (ret < 0) { pr_err("failed to announce new service\n"); return ret; } } /* Notify any potential lookups about the new server */ list_for_each(li, &qrtr_ns.lookups) { lookup = container_of(li, struct qrtr_lookup, li); if (lookup->service && lookup->service != service) continue; if (lookup->instance && lookup->instance != instance) continue; lookup_notify(&lookup->sq, srv, true); } return ret; } static int ctrl_cmd_del_server(struct sockaddr_qrtr *from, unsigned int service, unsigned int instance, unsigned int node_id, unsigned int port) { struct qrtr_node *node; /* Ignore specified node and port for local servers*/ if (from->sq_node == qrtr_ns.local_node) { node_id = from->sq_node; port = from->sq_port; } /* Local servers may only unregister themselves */ if (from->sq_node == qrtr_ns.local_node && from->sq_port != port) return -EINVAL; node = node_get(node_id); if (!node) return -ENOENT; server_del(node, port, true); return 0; } static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from, unsigned int service, unsigned int instance) { struct qrtr_server_filter filter; struct qrtr_lookup *lookup; struct qrtr_server *srv; struct qrtr_node *node; unsigned long node_idx; unsigned long srv_idx; /* Accept only local observers */ if (from->sq_node != qrtr_ns.local_node) return -EINVAL; lookup = kzalloc(sizeof(*lookup), GFP_KERNEL); if (!lookup) return -ENOMEM; lookup->sq = *from; lookup->service = service; lookup->instance = instance; list_add_tail(&lookup->li, &qrtr_ns.lookups); memset(&filter, 0, sizeof(filter)); filter.service = service; filter.instance = instance; xa_for_each(&nodes, node_idx, node) { xa_for_each(&node->servers, srv_idx, srv) { if (!server_match(srv, &filter)) continue; lookup_notify(from, srv, true); } } /* Empty notification, to indicate end of listing */ lookup_notify(from, NULL, true); return 0; } static void ctrl_cmd_del_lookup(struct sockaddr_qrtr *from, unsigned int service, unsigned int instance) { struct qrtr_lookup *lookup; struct list_head *tmp; struct list_head *li; list_for_each_safe(li, tmp, &qrtr_ns.lookups) { lookup = container_of(li, struct qrtr_lookup, li); if (lookup->sq.sq_node != from->sq_node) continue; if (lookup->sq.sq_port != from->sq_port) continue; if (lookup->service != service) continue; if (lookup->instance && lookup->instance != instance) continue; list_del(&lookup->li); kfree(lookup); } } static void qrtr_ns_worker(struct work_struct *work) { const struct qrtr_ctrl_pkt *pkt; size_t recv_buf_size = 4096; struct sockaddr_qrtr sq; struct msghdr msg = { }; unsigned int cmd; ssize_t msglen; void *recv_buf; struct kvec iv; int ret; msg.msg_name = (struct sockaddr *)&sq; msg.msg_namelen = sizeof(sq); recv_buf = kzalloc(recv_buf_size, GFP_KERNEL); if (!recv_buf) return; for (;;) { iv.iov_base = recv_buf; iv.iov_len = recv_buf_size; msglen = kernel_recvmsg(qrtr_ns.sock, &msg, &iv, 1, iv.iov_len, MSG_DONTWAIT); if (msglen == -EAGAIN) break; if (msglen < 0) { pr_err("error receiving packet: %zd\n", msglen); break; } pkt = recv_buf; cmd = le32_to_cpu(pkt->cmd); if (cmd < ARRAY_SIZE(qrtr_ctrl_pkt_strings) && qrtr_ctrl_pkt_strings[cmd]) trace_qrtr_ns_message(qrtr_ctrl_pkt_strings[cmd], sq.sq_node, sq.sq_port); ret = 0; switch (cmd) { case QRTR_TYPE_HELLO: ret = ctrl_cmd_hello(&sq); break; case QRTR_TYPE_BYE: ret = ctrl_cmd_bye(&sq); break; case QRTR_TYPE_DEL_CLIENT: ret = ctrl_cmd_del_client(&sq, le32_to_cpu(pkt->client.node), le32_to_cpu(pkt->client.port)); break; case QRTR_TYPE_NEW_SERVER: ret = ctrl_cmd_new_server(&sq, le32_to_cpu(pkt->server.service), le32_to_cpu(pkt->server.instance), le32_to_cpu(pkt->server.node), le32_to_cpu(pkt->server.port)); break; case QRTR_TYPE_DEL_SERVER: ret = ctrl_cmd_del_server(&sq, le32_to_cpu(pkt->server.service), le32_to_cpu(pkt->server.instance), le32_to_cpu(pkt->server.node), le32_to_cpu(pkt->server.port)); break; case QRTR_TYPE_EXIT: case QRTR_TYPE_PING: case QRTR_TYPE_RESUME_TX: break; case QRTR_TYPE_NEW_LOOKUP: ret = ctrl_cmd_new_lookup(&sq, le32_to_cpu(pkt->server.service), le32_to_cpu(pkt->server.instance)); break; case QRTR_TYPE_DEL_LOOKUP: ctrl_cmd_del_lookup(&sq, le32_to_cpu(pkt->server.service), le32_to_cpu(pkt->server.instance)); break; } if (ret < 0) pr_err("failed while handling packet from %d:%d", sq.sq_node, sq.sq_port); } kfree(recv_buf); } static void qrtr_ns_data_ready(struct sock *sk) { trace_sk_data_ready(sk); queue_work(qrtr_ns.workqueue, &qrtr_ns.work); } int qrtr_ns_init(void) { struct sockaddr_qrtr sq; int ret; INIT_LIST_HEAD(&qrtr_ns.lookups); INIT_WORK(&qrtr_ns.work, qrtr_ns_worker); ret = sock_create_kern(&init_net, AF_QIPCRTR, SOCK_DGRAM, PF_QIPCRTR, &qrtr_ns.sock); if (ret < 0) return ret; ret = kernel_getsockname(qrtr_ns.sock, (struct sockaddr *)&sq); if (ret < 0) { pr_err("failed to get socket name\n"); goto err_sock; } qrtr_ns.workqueue = alloc_ordered_workqueue("qrtr_ns_handler", 0); if (!qrtr_ns.workqueue) { ret = -ENOMEM; goto err_sock; } qrtr_ns.sock->sk->sk_data_ready = qrtr_ns_data_ready; sq.sq_port = QRTR_PORT_CTRL; qrtr_ns.local_node = sq.sq_node; ret = kernel_bind(qrtr_ns.sock, (struct sockaddr *)&sq, sizeof(sq)); if (ret < 0) { pr_err("failed to bind to socket\n"); goto err_wq; } qrtr_ns.bcast_sq.sq_family = AF_QIPCRTR; qrtr_ns.bcast_sq.sq_node = QRTR_NODE_BCAST; qrtr_ns.bcast_sq.sq_port = QRTR_PORT_CTRL; ret = say_hello(&qrtr_ns.bcast_sq); if (ret < 0) goto err_wq; /* As the qrtr ns socket owner and creator is the same module, we have * to decrease the qrtr module reference count to guarantee that it * remains zero after the ns socket is created, otherwise, executing * "rmmod" command is unable to make the qrtr module deleted after the * qrtr module is inserted successfully. * * However, the reference count is increased twice in * sock_create_kern(): one is to increase the reference count of owner * of qrtr socket's proto_ops struct; another is to increment the * reference count of owner of qrtr proto struct. Therefore, we must * decrement the module reference count twice to ensure that it keeps * zero after server's listening socket is created. Of course, we * must bump the module reference count twice as well before the socket * is closed. */ module_put(qrtr_ns.sock->ops->owner); module_put(qrtr_ns.sock->sk->sk_prot_creator->owner); return 0; err_wq: destroy_workqueue(qrtr_ns.workqueue); err_sock: sock_release(qrtr_ns.sock); return ret; } EXPORT_SYMBOL_GPL(qrtr_ns_init); void qrtr_ns_remove(void) { cancel_work_sync(&qrtr_ns.work); destroy_workqueue(qrtr_ns.workqueue); /* sock_release() expects the two references that were put during * qrtr_ns_init(). This function is only called during module remove, * so try_stop_module() has already set the refcnt to 0. Use * __module_get() instead of try_module_get() to successfully take two * references. */ __module_get(qrtr_ns.sock->ops->owner); __module_get(qrtr_ns.sock->sk->sk_prot_creator->owner); sock_release(qrtr_ns.sock); } EXPORT_SYMBOL_GPL(qrtr_ns_remove); MODULE_AUTHOR("Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>"); MODULE_DESCRIPTION("Qualcomm IPC Router Nameservice"); MODULE_LICENSE("Dual BSD/GPL");
41 42 12 12 12 6 50 21 21 68 10 68 68 61 6 67 91 91 45 70 43 5 5 5 24 24 88 1 88 52 52 6 6 19 18 17 19 1 17 16 5 1 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 /* * Copyright (c) 2006, 2020 Oracle and/or its affiliates. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/skbuff.h> #include <linux/list.h> #include <linux/errqueue.h> #include "rds.h" static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_NONE] = 0, [RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), [RDS_EXTHDR_NPATHS] = sizeof(u16), [RDS_EXTHDR_GEN_NUM] = sizeof(u32), }; void rds_message_addref(struct rds_message *rm) { rdsdebug("addref rm %p ref %d\n", rm, refcount_read(&rm->m_refcount)); refcount_inc(&rm->m_refcount); } EXPORT_SYMBOL_GPL(rds_message_addref); static inline bool rds_zcookie_add(struct rds_msg_zcopy_info *info, u32 cookie) { struct rds_zcopy_cookies *ck = &info->zcookies; int ncookies = ck->num; if (ncookies == RDS_MAX_ZCOOKIES) return false; ck->cookies[ncookies] = cookie; ck->num = ++ncookies; return true; } static struct rds_msg_zcopy_info *rds_info_from_znotifier(struct rds_znotifier *znotif) { return container_of(znotif, struct rds_msg_zcopy_info, znotif); } void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *q) { unsigned long flags; LIST_HEAD(copy); struct rds_msg_zcopy_info *info, *tmp; spin_lock_irqsave(&q->lock, flags); list_splice(&q->zcookie_head, &copy); INIT_LIST_HEAD(&q->zcookie_head); spin_unlock_irqrestore(&q->lock, flags); list_for_each_entry_safe(info, tmp, &copy, rs_zcookie_next) { list_del(&info->rs_zcookie_next); kfree(info); } } static void rds_rm_zerocopy_callback(struct rds_sock *rs, struct rds_znotifier *znotif) { struct rds_msg_zcopy_info *info; struct rds_msg_zcopy_queue *q; u32 cookie = znotif->z_cookie; struct rds_zcopy_cookies *ck; struct list_head *head; unsigned long flags; mm_unaccount_pinned_pages(&znotif->z_mmp); q = &rs->rs_zcookie_queue; spin_lock_irqsave(&q->lock, flags); head = &q->zcookie_head; if (!list_empty(head)) { info = list_first_entry(head, struct rds_msg_zcopy_info, rs_zcookie_next); if (rds_zcookie_add(info, cookie)) { spin_unlock_irqrestore(&q->lock, flags); kfree(rds_info_from_znotifier(znotif)); /* caller invokes rds_wake_sk_sleep() */ return; } } info = rds_info_from_znotifier(znotif); ck = &info->zcookies; memset(ck, 0, sizeof(*ck)); WARN_ON(!rds_zcookie_add(info, cookie)); list_add_tail(&info->rs_zcookie_next, &q->zcookie_head); spin_unlock_irqrestore(&q->lock, flags); /* caller invokes rds_wake_sk_sleep() */ } /* * This relies on dma_map_sg() not touching sg[].page during merging. */ static void rds_message_purge(struct rds_message *rm) { unsigned long i, flags; bool zcopy = false; if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; spin_lock_irqsave(&rm->m_rs_lock, flags); if (rm->m_rs) { struct rds_sock *rs = rm->m_rs; if (rm->data.op_mmp_znotifier) { zcopy = true; rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); rds_wake_sk_sleep(rs); rm->data.op_mmp_znotifier = NULL; } sock_put(rds_rs_to_sk(rs)); rm->m_rs = NULL; } spin_unlock_irqrestore(&rm->m_rs_lock, flags); for (i = 0; i < rm->data.op_nents; i++) { /* XXX will have to put_page for page refs */ if (!zcopy) __free_page(sg_page(&rm->data.op_sg[i])); else put_page(sg_page(&rm->data.op_sg[i])); } rm->data.op_nents = 0; if (rm->rdma.op_active) rds_rdma_free_op(&rm->rdma); if (rm->rdma.op_rdma_mr) kref_put(&rm->rdma.op_rdma_mr->r_kref, __rds_put_mr_final); if (rm->atomic.op_active) rds_atomic_free_op(&rm->atomic); if (rm->atomic.op_rdma_mr) kref_put(&rm->atomic.op_rdma_mr->r_kref, __rds_put_mr_final); } void rds_message_put(struct rds_message *rm) { rdsdebug("put rm %p ref %d\n", rm, refcount_read(&rm->m_refcount)); WARN(!refcount_read(&rm->m_refcount), "danger refcount zero on %p\n", rm); if (refcount_dec_and_test(&rm->m_refcount)) { BUG_ON(!list_empty(&rm->m_sock_item)); BUG_ON(!list_empty(&rm->m_conn_item)); rds_message_purge(rm); kfree(rm); } } EXPORT_SYMBOL_GPL(rds_message_put); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq) { hdr->h_flags = 0; hdr->h_sport = sport; hdr->h_dport = dport; hdr->h_sequence = cpu_to_be64(seq); hdr->h_exthdr[0] = RDS_EXTHDR_NONE; } EXPORT_SYMBOL_GPL(rds_message_populate_header); int rds_message_add_extension(struct rds_header *hdr, unsigned int type, const void *data, unsigned int len) { unsigned int ext_len = sizeof(u8) + len; unsigned char *dst; /* For now, refuse to add more than one extension header */ if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE) return 0; if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type]) return 0; if (ext_len >= RDS_HEADER_EXT_SPACE) return 0; dst = hdr->h_exthdr; *dst++ = type; memcpy(dst, data, len); dst[len] = RDS_EXTHDR_NONE; return 1; } EXPORT_SYMBOL_GPL(rds_message_add_extension); /* * If a message has extension headers, retrieve them here. * Call like this: * * unsigned int pos = 0; * * while (1) { * buflen = sizeof(buffer); * type = rds_message_next_extension(hdr, &pos, buffer, &buflen); * if (type == RDS_EXTHDR_NONE) * break; * ... * } */ int rds_message_next_extension(struct rds_header *hdr, unsigned int *pos, void *buf, unsigned int *buflen) { unsigned int offset, ext_type, ext_len; u8 *src = hdr->h_exthdr; offset = *pos; if (offset >= RDS_HEADER_EXT_SPACE) goto none; /* Get the extension type and length. For now, the * length is implied by the extension type. */ ext_type = src[offset++]; if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX) goto none; ext_len = rds_exthdr_size[ext_type]; if (offset + ext_len > RDS_HEADER_EXT_SPACE) goto none; *pos = offset + ext_len; if (ext_len < *buflen) *buflen = ext_len; memcpy(buf, src + offset, *buflen); return ext_type; none: *pos = RDS_HEADER_EXT_SPACE; *buflen = 0; return RDS_EXTHDR_NONE; } int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) { struct rds_ext_header_rdma_dest ext_hdr; ext_hdr.h_rdma_rkey = cpu_to_be32(r_key); ext_hdr.h_rdma_offset = cpu_to_be32(offset); return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr)); } EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); /* * Each rds_message is allocated with extra space for the scatterlist entries * rds ops will need. This is to minimize memory allocation count. Then, each rds op * can grab SGs when initializing its part of the rds_message. */ struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp) { struct rds_message *rm; if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message)) return NULL; rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp); if (!rm) goto out; rm->m_used_sgs = 0; rm->m_total_sgs = extra_len / sizeof(struct scatterlist); refcount_set(&rm->m_refcount, 1); INIT_LIST_HEAD(&rm->m_sock_item); INIT_LIST_HEAD(&rm->m_conn_item); spin_lock_init(&rm->m_rs_lock); init_waitqueue_head(&rm->m_flush_wait); out: return rm; } /* * RDS ops use this to grab SG entries from the rm's sg pool. */ struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) { struct scatterlist *sg_first = (struct scatterlist *) &rm[1]; struct scatterlist *sg_ret; if (nents <= 0) { pr_warn("rds: alloc sgs failed! nents <= 0\n"); return ERR_PTR(-EINVAL); } if (rm->m_used_sgs + nents > rm->m_total_sgs) { pr_warn("rds: alloc sgs failed! total %d used %d nents %d\n", rm->m_total_sgs, rm->m_used_sgs, nents); return ERR_PTR(-ENOMEM); } sg_ret = &sg_first[rm->m_used_sgs]; sg_init_table(sg_ret, nents); rm->m_used_sgs += nents; return sg_ret; } struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) { struct rds_message *rm; unsigned int i; int num_sgs = DIV_ROUND_UP(total_len, PAGE_SIZE); int extra_bytes = num_sgs * sizeof(struct scatterlist); rm = rds_message_alloc(extra_bytes, GFP_NOWAIT); if (!rm) return ERR_PTR(-ENOMEM); set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); rm->data.op_nents = DIV_ROUND_UP(total_len, PAGE_SIZE); rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); if (IS_ERR(rm->data.op_sg)) { void *err = ERR_CAST(rm->data.op_sg); rds_message_put(rm); return err; } for (i = 0; i < rm->data.op_nents; ++i) { sg_set_page(&rm->data.op_sg[i], virt_to_page((void *)page_addrs[i]), PAGE_SIZE, 0); } return rm; } static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from) { struct scatterlist *sg; int ret = 0; int length = iov_iter_count(from); struct rds_msg_zcopy_info *info; rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); /* * now allocate and copy in the data payload. */ sg = rm->data.op_sg; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; INIT_LIST_HEAD(&info->rs_zcookie_next); rm->data.op_mmp_znotifier = &info->znotif; if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, length)) { ret = -ENOMEM; goto err; } while (iov_iter_count(from)) { struct page *pages; size_t start; ssize_t copied; copied = iov_iter_get_pages2(from, &pages, PAGE_SIZE, 1, &start); if (copied < 0) { struct mmpin *mmp; int i; for (i = 0; i < rm->data.op_nents; i++) put_page(sg_page(&rm->data.op_sg[i])); mmp = &rm->data.op_mmp_znotifier->z_mmp; mm_unaccount_pinned_pages(mmp); ret = -EFAULT; goto err; } length -= copied; sg_set_page(sg, pages, copied, start); rm->data.op_nents++; sg++; } WARN_ON_ONCE(length != 0); return ret; err: kfree(info); rm->data.op_mmp_znotifier = NULL; return ret; } int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, bool zcopy) { unsigned long to_copy, nbytes; unsigned long sg_off; struct scatterlist *sg; int ret = 0; rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); /* now allocate and copy in the data payload. */ sg = rm->data.op_sg; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ if (zcopy) return rds_message_zcopy_from_user(rm, from); while (iov_iter_count(from)) { if (!sg_page(sg)) { ret = rds_page_remainder_alloc(sg, iov_iter_count(from), GFP_HIGHUSER); if (ret) return ret; rm->data.op_nents++; sg_off = 0; } to_copy = min_t(unsigned long, iov_iter_count(from), sg->length - sg_off); rds_stats_add(s_copy_from_user, to_copy); nbytes = copy_page_from_iter(sg_page(sg), sg->offset + sg_off, to_copy, from); if (nbytes != to_copy) return -EFAULT; sg_off += to_copy; if (sg_off == sg->length) sg++; } return ret; } int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to) { struct rds_message *rm; struct scatterlist *sg; unsigned long to_copy; unsigned long vec_off; int copied; int ret; u32 len; rm = container_of(inc, struct rds_message, m_inc); len = be32_to_cpu(rm->m_inc.i_hdr.h_len); sg = rm->data.op_sg; vec_off = 0; copied = 0; while (iov_iter_count(to) && copied < len) { to_copy = min_t(unsigned long, iov_iter_count(to), sg->length - vec_off); to_copy = min_t(unsigned long, to_copy, len - copied); rds_stats_add(s_copy_to_user, to_copy); ret = copy_page_to_iter(sg_page(sg), sg->offset + vec_off, to_copy, to); if (ret != to_copy) return -EFAULT; vec_off += to_copy; copied += to_copy; if (vec_off == sg->length) { vec_off = 0; sg++; } } return copied; } /* * If the message is still on the send queue, wait until the transport * is done with it. This is particularly important for RDMA operations. */ void rds_message_wait(struct rds_message *rm) { wait_event_interruptible(rm->m_flush_wait, !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); } void rds_message_unmapped(struct rds_message *rm) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); wake_up_interruptible(&rm->m_flush_wait); } EXPORT_SYMBOL_GPL(rds_message_unmapped);
4 6 81 144 191 20 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 /* * net/tipc/addr.h: Include file for TIPC address utility routines * * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_ADDR_H #define _TIPC_ADDR_H #include <linux/types.h> #include <linux/tipc.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include "core.h" /* Struct tipc_uaddr: internal version of struct sockaddr_tipc. * Must be kept aligned both regarding field positions and size. */ struct tipc_uaddr { unsigned short family; unsigned char addrtype; signed char scope; union { struct { struct tipc_service_addr sa; u32 lookup_node; }; struct tipc_service_range sr; struct tipc_socket_addr sk; }; }; static inline void tipc_uaddr(struct tipc_uaddr *ua, u32 atype, u32 scope, u32 type, u32 lower, u32 upper) { ua->family = AF_TIPC; ua->addrtype = atype; ua->scope = scope; ua->sr.type = type; ua->sr.lower = lower; ua->sr.upper = upper; } static inline bool tipc_uaddr_valid(struct tipc_uaddr *ua, int len) { u32 atype; if (len < sizeof(struct sockaddr_tipc)) return false; atype = ua->addrtype; if (ua->family != AF_TIPC) return false; if (atype == TIPC_SERVICE_ADDR || atype == TIPC_SOCKET_ADDR) return true; if (atype == TIPC_SERVICE_RANGE) return ua->sr.upper >= ua->sr.lower; return false; } static inline u32 tipc_own_addr(struct net *net) { return tipc_net(net)->node_addr; } static inline u8 *tipc_own_id(struct net *net) { struct tipc_net *tn = tipc_net(net); if (!strlen(tn->node_id_string)) return NULL; return tn->node_id; } static inline char *tipc_own_id_string(struct net *net) { return tipc_net(net)->node_id_string; } static inline u32 tipc_cluster_mask(u32 addr) { return addr & TIPC_ZONE_CLUSTER_MASK; } static inline int tipc_node2scope(u32 node) { return node ? TIPC_NODE_SCOPE : TIPC_CLUSTER_SCOPE; } static inline int tipc_scope2node(struct net *net, int sc) { return sc != TIPC_NODE_SCOPE ? 0 : tipc_own_addr(net); } static inline int in_own_node(struct net *net, u32 addr) { return addr == tipc_own_addr(net) || !addr; } bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr); void tipc_set_node_id(struct net *net, u8 *id); void tipc_set_node_addr(struct net *net, u32 addr); char *tipc_nodeid2string(char *str, u8 *id); #endif
21123 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_X86_XSAVE_H #define __ASM_X86_XSAVE_H #include <linux/uaccess.h> #include <linux/types.h> #include <asm/processor.h> #include <asm/fpu/api.h> #include <asm/user.h> /* Bit 63 of XCR0 is reserved for future expansion */ #define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) #define FXSAVE_SIZE 512 #define XSAVE_HDR_SIZE 64 #define XSAVE_HDR_OFFSET FXSAVE_SIZE #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) #define XSAVE_ALIGNMENT 64 /* All currently supported user features */ #define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_XTILE) /* * Features which are restored when returning to user space. * PKRU is not restored on return to user space because PKRU * is switched eagerly in switch_to() and flush_thread() */ #define XFEATURE_MASK_USER_RESTORE \ (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU) /* Features which are dynamically enabled for a process on request */ #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER) /* * A supervisor state component may not always contain valuable information, * and its size may be huge. Saving/restoring such supervisor state components * at each context switch can cause high CPU and space overhead, which should * be avoided. Such supervisor state components should only be saved/restored * on demand. The on-demand supervisor features are set in this mask. * * Unlike the existing supported supervisor features, an independent supervisor * feature does not allocate a buffer in task->fpu, and the corresponding * supervisor state component cannot be saved/restored at each context switch. * * To support an independent supervisor feature, a developer should follow the * dos and don'ts as below: * - Do dynamically allocate a buffer for the supervisor state component. * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the * state component to/from the buffer. * - Don't set the bit corresponding to the independent supervisor feature in * IA32_XSS at run time, since it has been set at boot time. */ #define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR) /* * Unsupported supervisor features. When a supervisor feature in this mask is * supported in the future, move it to the supported supervisor feature mask. */ #define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \ XFEATURE_MASK_CET_KERNEL) /* All supervisor states including supported and unsupported states. */ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) /* * The feature mask required to restore FPU state: * - All user states which are not eagerly switched in switch_to()/exec() * - The suporvisor states */ #define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \ XFEATURE_MASK_SUPERVISOR_SUPPORTED) /* * Features in this mask have space allocated in the signal frame, but may not * have that space initialized when the feature is in its init state. */ #define XFEATURE_MASK_SIGFRAME_INITOPT (XFEATURE_MASK_XTILE | \ XFEATURE_MASK_USER_DYNAMIC) extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); int xfeature_size(int xfeature_nr); void xsaves(struct xregs_state *xsave, u64 mask); void xrstors(struct xregs_state *xsave, u64 mask); int xfd_enable_feature(u64 xfd_err); #ifdef CONFIG_X86_64 DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); #endif #ifdef CONFIG_X86_64 DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); static __always_inline __pure bool fpu_state_size_dynamic(void) { return static_branch_unlikely(&__fpu_state_size_dynamic); } #else static __always_inline __pure bool fpu_state_size_dynamic(void) { return false; } #endif #endif
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_blackhole.c Black hole queue * * Authors: Thomas Graf <tgraf@suug.ch> * * Note: Quantum tunneling is not supported. */ #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { qdisc_drop(skb, sch, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } static struct sk_buff *blackhole_dequeue(struct Qdisc *sch) { return NULL; } static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = { .id = "blackhole", .priv_size = 0, .enqueue = blackhole_enqueue, .dequeue = blackhole_dequeue, .peek = blackhole_dequeue, .owner = THIS_MODULE, }; static int __init blackhole_init(void) { return register_qdisc(&blackhole_qdisc_ops); } device_initcall(blackhole_init)
2 2 2 2 2 33 1 1 133 1 2 2 4 4 2 2 2 2 1 1 1 1 1 1 2 2 3 1 2 1 1 122 122 122 121 122 4 4 4 4 4 4 4 28 28 28 27 28 28 28 1 1 1 1 18 18 18 18 19 18 18 16 19 18 18 18 19 19 19 19 19 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 /* * Copyright (C) 2017 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/debugfs.h> #include <linux/etherdevice.h> #include <linux/ethtool_netlink.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <net/netdev_queues.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> #include <net/netlink.h> #include <net/net_shaper.h> #include <net/netdev_lock.h> #include <net/pkt_cls.h> #include <net/rtnetlink.h> #include <net/udp_tunnel.h> #include <net/busy_poll.h> #include "netdevsim.h" MODULE_IMPORT_NS("NETDEV_INTERNAL"); #define NSIM_RING_SIZE 256 static int nsim_napi_rx(struct nsim_rq *rq, struct sk_buff *skb) { if (skb_queue_len(&rq->skb_queue) > NSIM_RING_SIZE) { dev_kfree_skb_any(skb); return NET_RX_DROP; } skb_queue_tail(&rq->skb_queue, skb); return NET_RX_SUCCESS; } static int nsim_forward_skb(struct net_device *dev, struct sk_buff *skb, struct nsim_rq *rq) { return __dev_forward_skb(dev, skb) ?: nsim_napi_rx(rq, skb); } static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct netdevsim *ns = netdev_priv(dev); struct net_device *peer_dev; unsigned int len = skb->len; struct netdevsim *peer_ns; struct netdev_config *cfg; struct nsim_rq *rq; int rxq; rcu_read_lock(); if (!nsim_ipsec_tx(ns, skb)) goto out_drop_free; peer_ns = rcu_dereference(ns->peer); if (!peer_ns) goto out_drop_free; peer_dev = peer_ns->netdev; rxq = skb_get_queue_mapping(skb); if (rxq >= peer_dev->num_rx_queues) rxq = rxq % peer_dev->num_rx_queues; rq = peer_ns->rq[rxq]; cfg = peer_dev->cfg; if (skb_is_nonlinear(skb) && (cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED || (cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && cfg->hds_thresh > len))) skb_linearize(skb); skb_tx_timestamp(skb); if (unlikely(nsim_forward_skb(peer_dev, skb, rq) == NET_RX_DROP)) goto out_drop_cnt; if (!hrtimer_active(&rq->napi_timer)) hrtimer_start(&rq->napi_timer, us_to_ktime(5), HRTIMER_MODE_REL); rcu_read_unlock(); u64_stats_update_begin(&ns->syncp); ns->tx_packets++; ns->tx_bytes += len; u64_stats_update_end(&ns->syncp); return NETDEV_TX_OK; out_drop_free: dev_kfree_skb(skb); out_drop_cnt: rcu_read_unlock(); u64_stats_update_begin(&ns->syncp); ns->tx_dropped++; u64_stats_update_end(&ns->syncp); return NETDEV_TX_OK; } static void nsim_set_rx_mode(struct net_device *dev) { } static int nsim_change_mtu(struct net_device *dev, int new_mtu) { struct netdevsim *ns = netdev_priv(dev); if (ns->xdp.prog && !ns->xdp.prog->aux->xdp_has_frags && new_mtu > NSIM_XDP_MAX_MTU) return -EBUSY; WRITE_ONCE(dev->mtu, new_mtu); return 0; } static void nsim_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct netdevsim *ns = netdev_priv(dev); unsigned int start; do { start = u64_stats_fetch_begin(&ns->syncp); stats->tx_bytes = ns->tx_bytes; stats->tx_packets = ns->tx_packets; stats->tx_dropped = ns->tx_dropped; } while (u64_stats_fetch_retry(&ns->syncp, start)); } static int nsim_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) { return nsim_bpf_setup_tc_block_cb(type, type_data, cb_priv); } static int nsim_set_vf_mac(struct net_device *dev, int vf, u8 *mac) { struct netdevsim *ns = netdev_priv(dev); struct nsim_dev *nsim_dev = ns->nsim_dev; /* Only refuse multicast addresses, zero address can mean unset/any. */ if (vf >= nsim_dev_get_vfs(nsim_dev) || is_multicast_ether_addr(mac)) return -EINVAL; memcpy(nsim_dev->vfconfigs[vf].vf_mac, mac, ETH_ALEN); return 0; } static int nsim_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, __be16 vlan_proto) { struct netdevsim *ns = netdev_priv(dev); struct nsim_dev *nsim_dev = ns->nsim_dev; if (vf >= nsim_dev_get_vfs(nsim_dev) || vlan > 4095 || qos > 7) return -EINVAL; nsim_dev->vfconfigs[vf].vlan = vlan; nsim_dev->vfconfigs[vf].qos = qos; nsim_dev->vfconfigs[vf].vlan_proto = vlan_proto; return 0; } static int nsim_set_vf_rate(struct net_device *dev, int vf, int min, int max) { struct netdevsim *ns = netdev_priv(dev); struct nsim_dev *nsim_dev = ns->nsim_dev; if (nsim_esw_mode_is_switchdev(ns->nsim_dev)) { pr_err("Not supported in switchdev mode. Please use devlink API.\n"); return -EOPNOTSUPP; } if (vf >= nsim_dev_get_vfs(nsim_dev)) return -EINVAL; nsim_dev->vfconfigs[vf].min_tx_rate = min; nsim_dev->vfconfigs[vf].max_tx_rate = max; return 0; } static int nsim_set_vf_spoofchk(struct net_device *dev, int vf, bool val) { struct netdevsim *ns = netdev_priv(dev); struct nsim_dev *nsim_dev = ns->nsim_dev; if (vf >= nsim_dev_get_vfs(nsim_dev)) return -EINVAL; nsim_dev->vfconfigs[vf].spoofchk_enabled = val; return 0; } static int nsim_set_vf_rss_query_en(struct net_device *dev, int vf, bool val) { struct netdevsim *ns = netdev_priv(dev); struct nsim_dev *nsim_dev = ns->nsim_dev; if (vf >= nsim_dev_get_vfs(nsim_dev)) return -EINVAL; nsim_dev->vfconfigs[vf].rss_query_enabled = val; return 0; } static int nsim_set_vf_trust(struct net_device *dev, int vf, bool val) { struct netdevsim *ns = netdev_priv(dev); struct nsim_dev *nsim_dev = ns->nsim_dev; if (vf >= nsim_dev_get_vfs(nsim_dev)) return -EINVAL; nsim_dev->vfconfigs[vf].trusted = val; return 0; } static int nsim_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi) { struct netdevsim *ns = netdev_priv(dev); struct nsim_dev *nsim_dev = ns->nsim_dev; if (vf >= nsim_dev_get_vfs(nsim_dev)) return -EINVAL; ivi->vf = vf; ivi->linkstate = nsim_dev->vfconfigs[vf].link_state; ivi->min_tx_rate = nsim_dev->vfconfigs[vf].min_tx_rate; ivi->max_tx_rate = nsim_dev->vfconfigs[vf].max_tx_rate; ivi->vlan = nsim_dev->vfconfigs[vf].vlan; ivi->vlan_proto = nsim_dev->vfconfigs[vf].vlan_proto; ivi->qos = nsim_dev->vfconfigs[vf].qos; memcpy(&ivi->mac, nsim_dev->vfconfigs[vf].vf_mac, ETH_ALEN); ivi->spoofchk = nsim_dev->vfconfigs[vf].spoofchk_enabled; ivi->trusted = nsim_dev->vfconfigs[vf].trusted; ivi->rss_query_en = nsim_dev->vfconfigs[vf].rss_query_enabled; return 0; } static int nsim_set_vf_link_state(struct net_device *dev, int vf, int state) { struct netdevsim *ns = netdev_priv(dev); struct nsim_dev *nsim_dev = ns->nsim_dev; if (vf >= nsim_dev_get_vfs(nsim_dev)) return -EINVAL; switch (state) { case IFLA_VF_LINK_STATE_AUTO: case IFLA_VF_LINK_STATE_ENABLE: case IFLA_VF_LINK_STATE_DISABLE: break; default: return -EINVAL; } nsim_dev->vfconfigs[vf].link_state = state; return 0; } static void nsim_taprio_stats(struct tc_taprio_qopt_stats *stats) { stats->window_drops = 0; stats->tx_overruns = 0; } static int nsim_setup_tc_taprio(struct net_device *dev, struct tc_taprio_qopt_offload *offload) { int err = 0; switch (offload->cmd) { case TAPRIO_CMD_REPLACE: case TAPRIO_CMD_DESTROY: break; case TAPRIO_CMD_STATS: nsim_taprio_stats(&offload->stats); break; default: err = -EOPNOTSUPP; } return err; } static LIST_HEAD(nsim_block_cb_list); static int nsim_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { struct netdevsim *ns = netdev_priv(dev); switch (type) { case TC_SETUP_QDISC_TAPRIO: return nsim_setup_tc_taprio(dev, type_data); case TC_SETUP_BLOCK: return flow_block_cb_setup_simple(type_data, &nsim_block_cb_list, nsim_setup_tc_block_cb, ns, ns, true); default: return -EOPNOTSUPP; } } static int nsim_set_features(struct net_device *dev, netdev_features_t features) { struct netdevsim *ns = netdev_priv(dev); if ((dev->features & NETIF_F_HW_TC) > (features & NETIF_F_HW_TC)) return nsim_bpf_disable_tc(ns); return 0; } static int nsim_get_iflink(const struct net_device *dev) { struct netdevsim *nsim, *peer; int iflink; nsim = netdev_priv(dev); rcu_read_lock(); peer = rcu_dereference(nsim->peer); iflink = peer ? READ_ONCE(peer->netdev->ifindex) : READ_ONCE(dev->ifindex); rcu_read_unlock(); return iflink; } static int nsim_rcv(struct nsim_rq *rq, int budget) { struct sk_buff *skb; int i; for (i = 0; i < budget; i++) { if (skb_queue_empty(&rq->skb_queue)) break; skb = skb_dequeue(&rq->skb_queue); skb_mark_napi_id(skb, &rq->napi); netif_receive_skb(skb); } return i; } static int nsim_poll(struct napi_struct *napi, int budget) { struct nsim_rq *rq = container_of(napi, struct nsim_rq, napi); int done; done = nsim_rcv(rq, budget); napi_complete(napi); return done; } static int nsim_create_page_pool(struct page_pool **p, struct napi_struct *napi) { struct page_pool_params params = { .order = 0, .pool_size = NSIM_RING_SIZE, .nid = NUMA_NO_NODE, .dev = &napi->dev->dev, .napi = napi, .dma_dir = DMA_BIDIRECTIONAL, .netdev = napi->dev, }; struct page_pool *pool; pool = page_pool_create(&params); if (IS_ERR(pool)) return PTR_ERR(pool); *p = pool; return 0; } static int nsim_init_napi(struct netdevsim *ns) { struct net_device *dev = ns->netdev; struct nsim_rq *rq; int err, i; for (i = 0; i < dev->num_rx_queues; i++) { rq = ns->rq[i]; netif_napi_add_config_locked(dev, &rq->napi, nsim_poll, i); } for (i = 0; i < dev->num_rx_queues; i++) { rq = ns->rq[i]; err = nsim_create_page_pool(&rq->page_pool, &rq->napi); if (err) goto err_pp_destroy; } return 0; err_pp_destroy: while (i--) { page_pool_destroy(ns->rq[i]->page_pool); ns->rq[i]->page_pool = NULL; } for (i = 0; i < dev->num_rx_queues; i++) __netif_napi_del_locked(&ns->rq[i]->napi); return err; } static enum hrtimer_restart nsim_napi_schedule(struct hrtimer *timer) { struct nsim_rq *rq; rq = container_of(timer, struct nsim_rq, napi_timer); napi_schedule(&rq->napi); return HRTIMER_NORESTART; } static void nsim_rq_timer_init(struct nsim_rq *rq) { hrtimer_setup(&rq->napi_timer, nsim_napi_schedule, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } static void nsim_enable_napi(struct netdevsim *ns) { struct net_device *dev = ns->netdev; int i; for (i = 0; i < dev->num_rx_queues; i++) { struct nsim_rq *rq = ns->rq[i]; netif_queue_set_napi(dev, i, NETDEV_QUEUE_TYPE_RX, &rq->napi); napi_enable_locked(&rq->napi); } } static int nsim_open(struct net_device *dev) { struct netdevsim *ns = netdev_priv(dev); int err; netdev_assert_locked(dev); err = nsim_init_napi(ns); if (err) return err; nsim_enable_napi(ns); return 0; } static void nsim_del_napi(struct netdevsim *ns) { struct net_device *dev = ns->netdev; int i; for (i = 0; i < dev->num_rx_queues; i++) { struct nsim_rq *rq = ns->rq[i]; napi_disable_locked(&rq->napi); __netif_napi_del_locked(&rq->napi); } synchronize_net(); for (i = 0; i < dev->num_rx_queues; i++) { page_pool_destroy(ns->rq[i]->page_pool); ns->rq[i]->page_pool = NULL; } } static int nsim_stop(struct net_device *dev) { struct netdevsim *ns = netdev_priv(dev); struct netdevsim *peer; netdev_assert_locked(dev); netif_carrier_off(dev); peer = rtnl_dereference(ns->peer); if (peer) netif_carrier_off(peer->netdev); nsim_del_napi(ns); return 0; } static int nsim_shaper_set(struct net_shaper_binding *binding, const struct net_shaper *shaper, struct netlink_ext_ack *extack) { return 0; } static int nsim_shaper_del(struct net_shaper_binding *binding, const struct net_shaper_handle *handle, struct netlink_ext_ack *extack) { return 0; } static int nsim_shaper_group(struct net_shaper_binding *binding, int leaves_count, const struct net_shaper *leaves, const struct net_shaper *root, struct netlink_ext_ack *extack) { return 0; } static void nsim_shaper_cap(struct net_shaper_binding *binding, enum net_shaper_scope scope, unsigned long *flags) { *flags = ULONG_MAX; } static const struct net_shaper_ops nsim_shaper_ops = { .set = nsim_shaper_set, .delete = nsim_shaper_del, .group = nsim_shaper_group, .capabilities = nsim_shaper_cap, }; static const struct net_device_ops nsim_netdev_ops = { .ndo_start_xmit = nsim_start_xmit, .ndo_set_rx_mode = nsim_set_rx_mode, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = nsim_change_mtu, .ndo_get_stats64 = nsim_get_stats64, .ndo_set_vf_mac = nsim_set_vf_mac, .ndo_set_vf_vlan = nsim_set_vf_vlan, .ndo_set_vf_rate = nsim_set_vf_rate, .ndo_set_vf_spoofchk = nsim_set_vf_spoofchk, .ndo_set_vf_trust = nsim_set_vf_trust, .ndo_get_vf_config = nsim_get_vf_config, .ndo_set_vf_link_state = nsim_set_vf_link_state, .ndo_set_vf_rss_query_en = nsim_set_vf_rss_query_en, .ndo_setup_tc = nsim_setup_tc, .ndo_set_features = nsim_set_features, .ndo_get_iflink = nsim_get_iflink, .ndo_bpf = nsim_bpf, .ndo_open = nsim_open, .ndo_stop = nsim_stop, .net_shaper_ops = &nsim_shaper_ops, }; static const struct net_device_ops nsim_vf_netdev_ops = { .ndo_start_xmit = nsim_start_xmit, .ndo_set_rx_mode = nsim_set_rx_mode, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = nsim_change_mtu, .ndo_get_stats64 = nsim_get_stats64, .ndo_setup_tc = nsim_setup_tc, .ndo_set_features = nsim_set_features, }; /* We don't have true per-queue stats, yet, so do some random fakery here. * Only report stuff for queue 0. */ static void nsim_get_queue_stats_rx(struct net_device *dev, int idx, struct netdev_queue_stats_rx *stats) { struct rtnl_link_stats64 rtstats = {}; if (!idx) nsim_get_stats64(dev, &rtstats); stats->packets = rtstats.rx_packets - !!rtstats.rx_packets; stats->bytes = rtstats.rx_bytes; } static void nsim_get_queue_stats_tx(struct net_device *dev, int idx, struct netdev_queue_stats_tx *stats) { struct rtnl_link_stats64 rtstats = {}; if (!idx) nsim_get_stats64(dev, &rtstats); stats->packets = rtstats.tx_packets - !!rtstats.tx_packets; stats->bytes = rtstats.tx_bytes; } static void nsim_get_base_stats(struct net_device *dev, struct netdev_queue_stats_rx *rx, struct netdev_queue_stats_tx *tx) { struct rtnl_link_stats64 rtstats = {}; nsim_get_stats64(dev, &rtstats); rx->packets = !!rtstats.rx_packets; rx->bytes = 0; tx->packets = !!rtstats.tx_packets; tx->bytes = 0; } static const struct netdev_stat_ops nsim_stat_ops = { .get_queue_stats_tx = nsim_get_queue_stats_tx, .get_queue_stats_rx = nsim_get_queue_stats_rx, .get_base_stats = nsim_get_base_stats, }; static struct nsim_rq *nsim_queue_alloc(void) { struct nsim_rq *rq; rq = kzalloc(sizeof(*rq), GFP_KERNEL_ACCOUNT); if (!rq) return NULL; skb_queue_head_init(&rq->skb_queue); nsim_rq_timer_init(rq); return rq; } static void nsim_queue_free(struct nsim_rq *rq) { hrtimer_cancel(&rq->napi_timer); skb_queue_purge_reason(&rq->skb_queue, SKB_DROP_REASON_QUEUE_PURGE); kfree(rq); } /* Queue reset mode is controlled by ns->rq_reset_mode. * - normal - new NAPI new pool (old NAPI enabled when new added) * - mode 1 - allocate new pool (NAPI is only disabled / enabled) * - mode 2 - new NAPI new pool (old NAPI removed before new added) * - mode 3 - new NAPI new pool (old NAPI disabled when new added) */ struct nsim_queue_mem { struct nsim_rq *rq; struct page_pool *pp; }; static int nsim_queue_mem_alloc(struct net_device *dev, void *per_queue_mem, int idx) { struct nsim_queue_mem *qmem = per_queue_mem; struct netdevsim *ns = netdev_priv(dev); int err; if (ns->rq_reset_mode > 3) return -EINVAL; if (ns->rq_reset_mode == 1) { if (!netif_running(ns->netdev)) return -ENETDOWN; return nsim_create_page_pool(&qmem->pp, &ns->rq[idx]->napi); } qmem->rq = nsim_queue_alloc(); if (!qmem->rq) return -ENOMEM; err = nsim_create_page_pool(&qmem->rq->page_pool, &qmem->rq->napi); if (err) goto err_free; if (!ns->rq_reset_mode) netif_napi_add_config_locked(dev, &qmem->rq->napi, nsim_poll, idx); return 0; err_free: nsim_queue_free(qmem->rq); return err; } static void nsim_queue_mem_free(struct net_device *dev, void *per_queue_mem) { struct nsim_queue_mem *qmem = per_queue_mem; struct netdevsim *ns = netdev_priv(dev); page_pool_destroy(qmem->pp); if (qmem->rq) { if (!ns->rq_reset_mode) netif_napi_del_locked(&qmem->rq->napi); page_pool_destroy(qmem->rq->page_pool); nsim_queue_free(qmem->rq); } } static int nsim_queue_start(struct net_device *dev, void *per_queue_mem, int idx) { struct nsim_queue_mem *qmem = per_queue_mem; struct netdevsim *ns = netdev_priv(dev); netdev_assert_locked(dev); if (ns->rq_reset_mode == 1) { ns->rq[idx]->page_pool = qmem->pp; napi_enable_locked(&ns->rq[idx]->napi); return 0; } /* netif_napi_add()/_del() should normally be called from alloc/free, * here we want to test various call orders. */ if (ns->rq_reset_mode == 2) { netif_napi_del_locked(&ns->rq[idx]->napi); netif_napi_add_config_locked(dev, &qmem->rq->napi, nsim_poll, idx); } else if (ns->rq_reset_mode == 3) { netif_napi_add_config_locked(dev, &qmem->rq->napi, nsim_poll, idx); netif_napi_del_locked(&ns->rq[idx]->napi); } ns->rq[idx] = qmem->rq; napi_enable_locked(&ns->rq[idx]->napi); return 0; } static int nsim_queue_stop(struct net_device *dev, void *per_queue_mem, int idx) { struct nsim_queue_mem *qmem = per_queue_mem; struct netdevsim *ns = netdev_priv(dev); netdev_assert_locked(dev); napi_disable_locked(&ns->rq[idx]->napi); if (ns->rq_reset_mode == 1) { qmem->pp = ns->rq[idx]->page_pool; page_pool_disable_direct_recycling(qmem->pp); } else { qmem->rq = ns->rq[idx]; } return 0; } static const struct netdev_queue_mgmt_ops nsim_queue_mgmt_ops = { .ndo_queue_mem_size = sizeof(struct nsim_queue_mem), .ndo_queue_mem_alloc = nsim_queue_mem_alloc, .ndo_queue_mem_free = nsim_queue_mem_free, .ndo_queue_start = nsim_queue_start, .ndo_queue_stop = nsim_queue_stop, }; static ssize_t nsim_qreset_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct netdevsim *ns = file->private_data; unsigned int queue, mode; char buf[32]; ssize_t ret; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, data, count)) return -EFAULT; buf[count] = '\0'; ret = sscanf(buf, "%u %u", &queue, &mode); if (ret != 2) return -EINVAL; netdev_lock(ns->netdev); if (queue >= ns->netdev->real_num_rx_queues) { ret = -EINVAL; goto exit_unlock; } ns->rq_reset_mode = mode; ret = netdev_rx_queue_restart(ns->netdev, queue); ns->rq_reset_mode = 0; if (ret) goto exit_unlock; ret = count; exit_unlock: netdev_unlock(ns->netdev); return ret; } static const struct file_operations nsim_qreset_fops = { .open = simple_open, .write = nsim_qreset_write, .owner = THIS_MODULE, }; static ssize_t nsim_pp_hold_read(struct file *file, char __user *data, size_t count, loff_t *ppos) { struct netdevsim *ns = file->private_data; char buf[3] = "n\n"; if (ns->page) buf[0] = 'y'; return simple_read_from_buffer(data, count, ppos, buf, 2); } static ssize_t nsim_pp_hold_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct netdevsim *ns = file->private_data; ssize_t ret; bool val; ret = kstrtobool_from_user(data, count, &val); if (ret) return ret; rtnl_lock(); ret = count; if (val == !!ns->page) goto exit; if (!netif_running(ns->netdev) && val) { ret = -ENETDOWN; } else if (val) { ns->page = page_pool_dev_alloc_pages(ns->rq[0]->page_pool); if (!ns->page) ret = -ENOMEM; } else { page_pool_put_full_page(ns->page->pp, ns->page, false); ns->page = NULL; } exit: rtnl_unlock(); return ret; } static const struct file_operations nsim_pp_hold_fops = { .open = simple_open, .read = nsim_pp_hold_read, .write = nsim_pp_hold_write, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; static void nsim_setup(struct net_device *dev) { ether_setup(dev); eth_hw_addr_random(dev); dev->tx_queue_len = 0; dev->flags &= ~IFF_MULTICAST; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; dev->features |= NETIF_F_HIGHDMA | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | NETIF_F_TSO; dev->hw_features |= NETIF_F_HW_TC | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | NETIF_F_TSO; dev->max_mtu = ETH_MAX_MTU; dev->xdp_features = NETDEV_XDP_ACT_HW_OFFLOAD; } static int nsim_queue_init(struct netdevsim *ns) { struct net_device *dev = ns->netdev; int i; ns->rq = kcalloc(dev->num_rx_queues, sizeof(*ns->rq), GFP_KERNEL_ACCOUNT); if (!ns->rq) return -ENOMEM; for (i = 0; i < dev->num_rx_queues; i++) { ns->rq[i] = nsim_queue_alloc(); if (!ns->rq[i]) goto err_free_prev; } return 0; err_free_prev: while (i--) kfree(ns->rq[i]); kfree(ns->rq); return -ENOMEM; } static void nsim_queue_uninit(struct netdevsim *ns) { struct net_device *dev = ns->netdev; int i; for (i = 0; i < dev->num_rx_queues; i++) nsim_queue_free(ns->rq[i]); kfree(ns->rq); ns->rq = NULL; } static int nsim_init_netdevsim(struct netdevsim *ns) { struct mock_phc *phc; int err; phc = mock_phc_create(&ns->nsim_bus_dev->dev); if (IS_ERR(phc)) return PTR_ERR(phc); ns->phc = phc; ns->netdev->netdev_ops = &nsim_netdev_ops; ns->netdev->stat_ops = &nsim_stat_ops; ns->netdev->queue_mgmt_ops = &nsim_queue_mgmt_ops; netdev_lockdep_set_classes(ns->netdev); err = nsim_udp_tunnels_info_create(ns->nsim_dev, ns->netdev); if (err) goto err_phc_destroy; rtnl_lock(); err = nsim_queue_init(ns); if (err) goto err_utn_destroy; err = nsim_bpf_init(ns); if (err) goto err_rq_destroy; nsim_macsec_init(ns); nsim_ipsec_init(ns); err = register_netdevice(ns->netdev); if (err) goto err_ipsec_teardown; rtnl_unlock(); if (IS_ENABLED(CONFIG_DEBUG_NET)) { ns->nb.notifier_call = netdev_debug_event; if (register_netdevice_notifier_dev_net(ns->netdev, &ns->nb, &ns->nn)) ns->nb.notifier_call = NULL; } return 0; err_ipsec_teardown: nsim_ipsec_teardown(ns); nsim_macsec_teardown(ns); nsim_bpf_uninit(ns); err_rq_destroy: nsim_queue_uninit(ns); err_utn_destroy: rtnl_unlock(); nsim_udp_tunnels_info_destroy(ns->netdev); err_phc_destroy: mock_phc_destroy(ns->phc); return err; } static int nsim_init_netdevsim_vf(struct netdevsim *ns) { int err; ns->netdev->netdev_ops = &nsim_vf_netdev_ops; rtnl_lock(); err = register_netdevice(ns->netdev); rtnl_unlock(); return err; } static void nsim_exit_netdevsim(struct netdevsim *ns) { nsim_udp_tunnels_info_destroy(ns->netdev); mock_phc_destroy(ns->phc); } struct netdevsim * nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port) { struct net_device *dev; struct netdevsim *ns; int err; dev = alloc_netdev_mq(sizeof(*ns), "eth%d", NET_NAME_UNKNOWN, nsim_setup, nsim_dev->nsim_bus_dev->num_queues); if (!dev) return ERR_PTR(-ENOMEM); dev_net_set(dev, nsim_dev_net(nsim_dev)); ns = netdev_priv(dev); ns->netdev = dev; u64_stats_init(&ns->syncp); ns->nsim_dev = nsim_dev; ns->nsim_dev_port = nsim_dev_port; ns->nsim_bus_dev = nsim_dev->nsim_bus_dev; SET_NETDEV_DEV(dev, &ns->nsim_bus_dev->dev); SET_NETDEV_DEVLINK_PORT(dev, &nsim_dev_port->devlink_port); nsim_ethtool_init(ns); if (nsim_dev_port_is_pf(nsim_dev_port)) err = nsim_init_netdevsim(ns); else err = nsim_init_netdevsim_vf(ns); if (err) goto err_free_netdev; ns->pp_dfs = debugfs_create_file("pp_hold", 0600, nsim_dev_port->ddir, ns, &nsim_pp_hold_fops); ns->qr_dfs = debugfs_create_file("queue_reset", 0200, nsim_dev_port->ddir, ns, &nsim_qreset_fops); return ns; err_free_netdev: free_netdev(dev); return ERR_PTR(err); } void nsim_destroy(struct netdevsim *ns) { struct net_device *dev = ns->netdev; struct netdevsim *peer; debugfs_remove(ns->qr_dfs); debugfs_remove(ns->pp_dfs); if (ns->nb.notifier_call) unregister_netdevice_notifier_dev_net(ns->netdev, &ns->nb, &ns->nn); rtnl_lock(); peer = rtnl_dereference(ns->peer); if (peer) RCU_INIT_POINTER(peer->peer, NULL); RCU_INIT_POINTER(ns->peer, NULL); unregister_netdevice(dev); if (nsim_dev_port_is_pf(ns->nsim_dev_port)) { nsim_macsec_teardown(ns); nsim_ipsec_teardown(ns); nsim_bpf_uninit(ns); nsim_queue_uninit(ns); } rtnl_unlock(); if (nsim_dev_port_is_pf(ns->nsim_dev_port)) nsim_exit_netdevsim(ns); /* Put this intentionally late to exercise the orphaning path */ if (ns->page) { page_pool_put_full_page(ns->page->pp, ns->page, false); ns->page = NULL; } free_netdev(dev); } bool netdev_is_nsim(struct net_device *dev) { return dev->netdev_ops == &nsim_netdev_ops; } static int nsim_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { NL_SET_ERR_MSG_MOD(extack, "Please use: echo \"[ID] [PORT_COUNT] [NUM_QUEUES]\" > /sys/bus/netdevsim/new_device"); return -EOPNOTSUPP; } static struct rtnl_link_ops nsim_link_ops __read_mostly = { .kind = DRV_NAME, .validate = nsim_validate, }; static int __init nsim_module_init(void) { int err; err = nsim_dev_init(); if (err) return err; err = nsim_bus_init(); if (err) goto err_dev_exit; err = rtnl_link_register(&nsim_link_ops); if (err) goto err_bus_exit; return 0; err_bus_exit: nsim_bus_exit(); err_dev_exit: nsim_dev_exit(); return err; } static void __exit nsim_module_exit(void) { rtnl_link_unregister(&nsim_link_ops); nsim_bus_exit(); nsim_dev_exit(); } module_init(nsim_module_init); module_exit(nsim_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Simulated networking device for testing"); MODULE_ALIAS_RTNL_LINK(DRV_NAME);
113 363 364 363 364 12 7 57 9 9 6 4 94 78 16 1 92 57 60 29 60 29 9 9 3 6 16 16 150 148 150 2 2 2 143 7 2 128 118 107 10 1 3 115 65 49 2 8 105 113 72 41 72 41 1 73 16 19 8 1 827 531 304 328 101 308 211 304 474 473 473 6 6 2 3 2 1 2 3 5 134 135 21 18 21 6 21 40 41 3 18 10 13 23 22 20 11 50 5 3 20 3 20 4 29 55 55 24 31 31 28 43 42 42 45 11 2 22 23 23 23 5 2 9 7 25 2 36 2 34 33 34 2 2 32 22 9 2 32 9 2 11 11 6 26 1 25 3 25 24 24 24 25 17 1 17 17 3 11 4 14 39 1 38 9 1 29 38 9 18 28 37 18 157 157 143 35 87 87 80 40 4 4 7 7 13 4 9 4 9 15 9 7 3 10 3 7 4 6 2 3 38 3 35 3 2 1 26 2 10 12 1 22 22 13 13 13 8 22 6 4 8 1 9 4 9 13 3 6 21 11 13 2 11 18 18 8 5 4 9 3 3 3 2 6 6 50 28 16 6 119 96 26 26 128 25 57 47 47 6 6 2 4 6 55 56 39 17 55 20 1 19 3 3 4 4 2 2 4 13 6 8 8 7 1 15 4 1 10 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/jhash.h> #include <linux/filter.h> #include <linux/rculist_nulls.h> #include <linux/rcupdate_wait.h> #include <linux/random.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/btf_ids.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" #include <linux/bpf_mem_alloc.h> #include <asm/rqspinlock.h> #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) #define BATCH_OPS(_name) \ .map_lookup_batch = \ _name##_map_lookup_batch, \ .map_lookup_and_delete_batch = \ _name##_map_lookup_and_delete_batch, \ .map_update_batch = \ generic_map_update_batch, \ .map_delete_batch = \ generic_map_delete_batch /* * The bucket lock has two protection scopes: * * 1) Serializing concurrent operations from BPF programs on different * CPUs * * 2) Serializing concurrent operations from BPF programs and sys_bpf() * * BPF programs can execute in any context including perf, kprobes and * tracing. As there are almost no limits where perf, kprobes and tracing * can be invoked from the lock operations need to be protected against * deadlocks. Deadlocks can be caused by recursion and by an invocation in * the lock held section when functions which acquire this lock are invoked * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU * variable bpf_prog_active, which prevents BPF programs attached to perf * events, kprobes and tracing to be invoked before the prior invocation * from one of these contexts completed. sys_bpf() uses the same mechanism * by pinning the task to the current CPU and incrementing the recursion * protection across the map operation. * * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain * operations like memory allocations (even with GFP_ATOMIC) from atomic * contexts. This is required because even with GFP_ATOMIC the memory * allocator calls into code paths which acquire locks with long held lock * sections. To ensure the deterministic behaviour these locks are regular * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only * true atomic contexts on an RT kernel are the low level hardware * handling, scheduling, low level interrupt handling, NMIs etc. None of * these contexts should ever do memory allocations. * * As regular device interrupt handlers and soft interrupts are forced into * thread context, the existing code which does * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*(); * just works. * * In theory the BPF locks could be converted to regular spinlocks as well, * but the bucket locks and percpu_freelist locks can be taken from * arbitrary contexts (perf, kprobes, tracepoints) which are required to be * atomic contexts even on RT. Before the introduction of bpf_mem_alloc, * it is only safe to use raw spinlock for preallocated hash map on a RT kernel, * because there is no memory allocation within the lock held sections. However * after hash map was fully converted to use bpf_mem_alloc, there will be * non-synchronous memory allocation for non-preallocated hash map, so it is * safe to always use raw spinlock for bucket lock. */ struct bucket { struct hlist_nulls_head head; rqspinlock_t raw_lock; }; #define HASHTAB_MAP_LOCK_COUNT 8 #define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) struct bpf_htab { struct bpf_map map; struct bpf_mem_alloc ma; struct bpf_mem_alloc pcpu_ma; struct bucket *buckets; void *elems; union { struct pcpu_freelist freelist; struct bpf_lru lru; }; struct htab_elem *__percpu *extra_elems; /* number of elements in non-preallocated hashtable are kept * in either pcount or count */ struct percpu_counter pcount; atomic_t count; bool use_percpu_counter; u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; }; /* each htab element is struct htab_elem + key + value */ struct htab_elem { union { struct hlist_nulls_node hash_node; struct { void *padding; union { struct pcpu_freelist_node fnode; struct htab_elem *batch_flink; }; }; }; union { /* pointer to per-cpu pointer */ void *ptr_to_pptr; struct bpf_lru_node lru_node; }; u32 hash; char key[] __aligned(8); }; static inline bool htab_is_prealloc(const struct bpf_htab *htab) { return !(htab->map.map_flags & BPF_F_NO_PREALLOC); } static void htab_init_buckets(struct bpf_htab *htab) { unsigned int i; for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); raw_res_spin_lock_init(&htab->buckets[i].raw_lock); cond_resched(); } } static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags) { unsigned long flags; int ret; ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags); if (ret) return ret; *pflags = flags; return 0; } static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags) { raw_res_spin_unlock_irqrestore(&b->raw_lock, flags); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); static bool htab_is_lru(const struct bpf_htab *htab) { return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } static bool htab_is_percpu(const struct bpf_htab *htab) { return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { return *(void __percpu **)(l->key + roundup(key_size, 8)); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) { return *(void **)(l->key + roundup(map->key_size, 8)); } static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) { return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } static bool htab_has_extra_elems(struct bpf_htab *htab) { return !htab_is_percpu(htab) && !htab_is_lru(htab); } static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); for (i = 0; i < num_entries; i++) { struct htab_elem *elem; elem = get_htab_elem(htab, i); if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } static void htab_free_prealloced_fields(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; if (IS_ERR_OR_NULL(htab->map.record)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); for (i = 0; i < num_entries; i++) { struct htab_elem *elem; elem = get_htab_elem(htab, i); if (htab_is_percpu(htab)) { void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); int cpu; for_each_possible_cpu(cpu) { bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); cond_resched(); } } else { bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } cond_resched(); } } static void htab_free_elems(struct bpf_htab *htab) { int i; if (!htab_is_percpu(htab)) goto free_elems; for (i = 0; i < htab->map.max_entries; i++) { void __percpu *pptr; pptr = htab_elem_get_ptr(get_htab_elem(htab, i), htab->map.key_size); free_percpu(pptr); cond_resched(); } free_elems: bpf_map_area_free(htab->elems); } /* The LRU list has a lock (lru_lock). Each htab bucket has a lock * (bucket_lock). If both locks need to be acquired together, the lock * order is always lru_lock -> bucket_lock and this only happens in * bpf_lru_list.c logic. For example, certain code path of * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), * will acquire lru_lock first followed by acquiring bucket_lock. * * In hashtab.c, to avoid deadlock, lock acquisition of * bucket_lock followed by lru_lock is not allowed. In such cases, * bucket_lock needs to be released first before acquiring lru_lock. */ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, u32 hash) { struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); struct htab_elem *l; if (node) { bpf_map_inc_elem_count(&htab->map); l = container_of(node, struct htab_elem, lru_node); memcpy(l->key, key, htab->map.key_size); return l; } return NULL; } static int prealloc_init(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int err = -ENOMEM, i; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries, htab->map.numa_node); if (!htab->elems) return -ENOMEM; if (!htab_is_percpu(htab)) goto skip_percpu_elems; for (i = 0; i < num_entries; i++) { u32 size = round_up(htab->map.value_size, 8); void __percpu *pptr; pptr = bpf_map_alloc_percpu(&htab->map, size, 8, GFP_USER | __GFP_NOWARN); if (!pptr) goto free_elems; htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, pptr); cond_resched(); } skip_percpu_elems: if (htab_is_lru(htab)) err = bpf_lru_init(&htab->lru, htab->map.map_flags & BPF_F_NO_COMMON_LRU, offsetof(struct htab_elem, hash) - offsetof(struct htab_elem, lru_node), htab_lru_map_delete_node, htab); else err = pcpu_freelist_init(&htab->freelist); if (err) goto free_elems; if (htab_is_lru(htab)) bpf_lru_populate(&htab->lru, htab->elems, offsetof(struct htab_elem, lru_node), htab->elem_size, num_entries); else pcpu_freelist_populate(&htab->freelist, htab->elems + offsetof(struct htab_elem, fnode), htab->elem_size, num_entries); return 0; free_elems: htab_free_elems(htab); return err; } static void prealloc_destroy(struct bpf_htab *htab) { htab_free_elems(htab); if (htab_is_lru(htab)) bpf_lru_destroy(&htab->lru); else pcpu_freelist_destroy(&htab->freelist); } static int alloc_extra_elems(struct bpf_htab *htab) { struct htab_elem *__percpu *pptr, *l_new; struct pcpu_freelist_node *l; int cpu; pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8, GFP_USER | __GFP_NOWARN); if (!pptr) return -ENOMEM; for_each_possible_cpu(cpu) { l = pcpu_freelist_pop(&htab->freelist); /* pop will succeed, since prealloc_init() * preallocated extra num_possible_cpus elements */ l_new = container_of(l, struct htab_elem, fnode); *per_cpu_ptr(pptr, cpu) = l_new; } htab->extra_elems = pptr; return 0; } /* Called from syscall */ static int htab_map_alloc_check(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has * nothing to do with the map's value. */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); int numa_node = bpf_map_attr_numa_node(attr); BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); if (zero_seed && !capable(CAP_SYS_ADMIN)) /* Guard against local DoS, and discourage production use. */ return -EPERM; if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (!lru && percpu_lru) return -EINVAL; if (lru && !prealloc) return -ENOTSUPP; if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) return -EINVAL; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set */ if (attr->max_entries == 0 || attr->key_size == 0 || attr->value_size == 0) return -EINVAL; if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE - sizeof(struct htab_elem)) /* if key_size + value_size is bigger, the user space won't be * able to access the elements via bpf syscall. This check * also makes sure that the elem_size doesn't overflow and it's * kmalloc-able later in htab_map_update_elem() */ return -E2BIG; /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE) return -E2BIG; return 0; } static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has * nothing to do with the map's value. */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; int err; htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { /* ensure each CPU's lru list has >=1 elements. * since we are at it, make each lru list has the same * number of elements. */ htab->map.max_entries = roundup(attr->max_entries, num_possible_cpus()); if (htab->map.max_entries < attr->max_entries) htab->map.max_entries = rounddown(attr->max_entries, num_possible_cpus()); } /* hash table size must be power of 2; roundup_pow_of_two() can overflow * into UB on 32-bit arches, so check that first */ err = -E2BIG; if (htab->map.max_entries > 1UL << 31) goto free_htab; htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8); if (percpu) htab->elem_size += sizeof(void *); else htab->elem_size += round_up(htab->map.value_size, 8); /* check for u32 overflow */ if (htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; err = bpf_map_init_elem_count(&htab->map); if (err) goto free_htab; err = -ENOMEM; htab->buckets = bpf_map_area_alloc(htab->n_buckets * sizeof(struct bucket), htab->map.numa_node); if (!htab->buckets) goto free_elem_count; if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else htab->hashrnd = get_random_u32(); htab_init_buckets(htab); /* compute_batch_value() computes batch value as num_online_cpus() * 2 * and __percpu_counter_compare() needs * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus() * for percpu_counter to be faster than atomic_t. In practice the average bpf * hash map size is 10k, which means that a system with 64 cpus will fill * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore * define our own batch count as 32 then 10k hash map can be filled up to 80%: * 10k - 8k > 32 _batch_ * 64 _cpus_ * and __percpu_counter_compare() will still be fast. At that point hash map * collisions will dominate its performance anyway. Assume that hash map filled * to 50+% isn't going to be O(1) and use the following formula to choose * between percpu_counter and atomic_t. */ #define PERCPU_COUNTER_BATCH 32 if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH) htab->use_percpu_counter = true; if (htab->use_percpu_counter) { err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL); if (err) goto free_map_locked; } if (prealloc) { err = prealloc_init(htab); if (err) goto free_map_locked; if (!percpu && !lru) { /* lru itself can remove the least used element, so * there is no need for an extra elem during map_update. */ err = alloc_extra_elems(htab); if (err) goto free_prealloc; } } else { err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false); if (err) goto free_map_locked; if (percpu) { err = bpf_mem_alloc_init(&htab->pcpu_ma, round_up(htab->map.value_size, 8), true); if (err) goto free_map_locked; } } return &htab->map; free_prealloc: prealloc_destroy(htab); free_map_locked: if (htab->use_percpu_counter) percpu_counter_destroy(&htab->pcount); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); free_elem_count: bpf_map_free_elem_count(&htab->map); free_htab: bpf_map_area_free(htab); return ERR_PTR(err); } static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) { if (likely(key_len % 4 == 0)) return jhash2(key, key_len / 4, hashrnd); return jhash(key, key_len, hashrnd); } static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) { return &htab->buckets[hash & (htab->n_buckets - 1)]; } static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash) { return &__select_bucket(htab, hash)->head; } /* this lookup function can only be called with bucket lock taken */ static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, void *key, u32 key_size) { struct hlist_nulls_node *n; struct htab_elem *l; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l->hash == hash && !memcmp(&l->key, key, key_size)) return l; return NULL; } /* can be called without bucket lock. it will repeat the loop in * the unlikely event when elements moved from one bucket into another * while link list is being walked */ static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, u32 hash, void *key, u32 key_size, u32 n_buckets) { struct hlist_nulls_node *n; struct htab_elem *l; again: hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l->hash == hash && !memcmp(&l->key, key, key_size)) return l; if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) goto again; return NULL; } /* Called from syscall or from eBPF program directly, so * arguments have to match bpf_map_lookup_elem() exactly. * The return value is adjusted by BPF instructions * in htab_map_gen_lookup(). */ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; struct htab_elem *l; u32 hash, key_size; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); head = select_bucket(htab, hash); l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); return l; } static void *htab_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) return l->key + round_up(map->key_size, 8); return NULL; } /* inline bpf_map_lookup_elem() call. * Instead of: * bpf_prog * bpf_map_lookup_elem * map->ops->map_lookup_elem * htab_map_lookup_elem * __htab_map_lookup_elem * do: * bpf_prog * __htab_map_lookup_elem */ static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, (void *(*)(struct bpf_map *map, void *key))NULL)); *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, offsetof(struct htab_elem, key) + round_up(map->key_size, 8)); return insn - insn_buf; } static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, void *key, const bool mark) { struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) { if (mark) bpf_lru_node_set_ref(&l->lru_node); return l->key + round_up(map->key_size, 8); } return NULL; } static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) { return __htab_lru_map_lookup_elem(map, key, true); } static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) { return __htab_lru_map_lookup_elem(map, key, false); } static int htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; const int ref_reg = BPF_REG_1; BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, (void *(*)(struct bpf_map *map, void *key))NULL)); *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, offsetof(struct htab_elem, lru_node) + offsetof(struct bpf_lru_node, ref)); *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1); *insn++ = BPF_ST_MEM(BPF_B, ret, offsetof(struct htab_elem, lru_node) + offsetof(struct bpf_lru_node, ref), 1); *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, offsetof(struct htab_elem, key) + round_up(map->key_size, 8)); return insn - insn_buf; } static void check_and_free_fields(struct bpf_htab *htab, struct htab_elem *elem) { if (IS_ERR_OR_NULL(htab->map.record)) return; if (htab_is_percpu(htab)) { void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); int cpu; for_each_possible_cpu(cpu) bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); } else { void *map_value = elem->key + round_up(htab->map.key_size, 8); bpf_obj_free_fields(htab->map.record, map_value); } } /* It is called from the bpf_lru_list when the LRU needs to delete * older elements from the htab. */ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) { struct bpf_htab *htab = arg; struct htab_elem *l = NULL, *tgt_l; struct hlist_nulls_head *head; struct hlist_nulls_node *n; unsigned long flags; struct bucket *b; int ret; tgt_l = container_of(node, struct htab_elem, lru_node); b = __select_bucket(htab, tgt_l->hash); head = &b->head; ret = htab_lock_bucket(b, &flags); if (ret) return false; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { hlist_nulls_del_rcu(&l->hash_node); bpf_map_dec_elem_count(&htab->map); break; } htab_unlock_bucket(b, flags); if (l == tgt_l) check_and_free_fields(htab, l); return l == tgt_l; } /* Called from syscall */ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; struct htab_elem *l, *next_l; u32 hash, key_size; int i = 0; WARN_ON_ONCE(!rcu_read_lock_held()); key_size = map->key_size; if (!key) goto find_first_elem; hash = htab_map_hash(key, key_size, htab->hashrnd); head = select_bucket(htab, hash); /* lookup the key */ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); if (!l) goto find_first_elem; /* key was found, get next key in the same bucket */ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), struct htab_elem, hash_node); if (next_l) { /* if next elem in this hash list is non-zero, just return it */ memcpy(next_key, next_l->key, key_size); return 0; } /* no more elements in this hash list, go to the next bucket */ i = hash & (htab->n_buckets - 1); i++; find_first_elem: /* iterate over buckets */ for (; i < htab->n_buckets; i++) { head = select_bucket(htab, i); /* pick first element in the bucket */ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), struct htab_elem, hash_node); if (next_l) { /* if it's not empty, just return it */ memcpy(next_key, next_l->key, key_size); return 0; } } /* iterated over all buckets and all elements */ return -ENOENT; } static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { check_and_free_fields(htab, l); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); bpf_mem_cache_free(&htab->ma, l); } static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) { struct bpf_map *map = &htab->map; void *ptr; if (map->ops->map_fd_put_ptr) { ptr = fd_htab_map_get_ptr(map, l); map->ops->map_fd_put_ptr(map, ptr, true); } } static bool is_map_full(struct bpf_htab *htab) { if (htab->use_percpu_counter) return __percpu_counter_compare(&htab->pcount, htab->map.max_entries, PERCPU_COUNTER_BATCH) >= 0; return atomic_read(&htab->count) >= htab->map.max_entries; } static void inc_elem_count(struct bpf_htab *htab) { bpf_map_inc_elem_count(&htab->map); if (htab->use_percpu_counter) percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH); else atomic_inc(&htab->count); } static void dec_elem_count(struct bpf_htab *htab) { bpf_map_dec_elem_count(&htab->map); if (htab->use_percpu_counter) percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH); else atomic_dec(&htab->count); } static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { htab_put_fd_value(htab, l); if (htab_is_prealloc(htab)) { bpf_map_dec_elem_count(&htab->map); check_and_free_fields(htab, l); pcpu_freelist_push(&htab->freelist, &l->fnode); } else { dec_elem_count(htab); htab_elem_free(htab, l); } } static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, void *value, bool onallcpus) { if (!onallcpus) { /* copy true value_size bytes */ copy_map_value(&htab->map, this_cpu_ptr(pptr), value); } else { u32 size = round_up(htab->map.value_size, 8); int off = 0, cpu; for_each_possible_cpu(cpu) { copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off); off += size; } } } static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, void *value, bool onallcpus) { /* When not setting the initial value on all cpus, zero-fill element * values for other cpus. Otherwise, bpf program has no way to ensure * known initial values for cpus other than current one * (onallcpus=false always when coming from bpf prog). */ if (!onallcpus) { int current_cpu = raw_smp_processor_id(); int cpu; for_each_possible_cpu(cpu) { if (cpu == current_cpu) copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value); else /* Since elem is preallocated, we cannot touch special fields */ zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu)); } } else { pcpu_copy_value(htab, pptr, value, onallcpus); } } static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && BITS_PER_LONG == 64; } static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { u32 size = htab->map.value_size; bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; if (prealloc) { if (old_elem) { /* if we're updating the existing element, * use per-cpu extra elems to avoid freelist_pop/push */ pl_new = this_cpu_ptr(htab->extra_elems); l_new = *pl_new; *pl_new = old_elem; } else { struct pcpu_freelist_node *l; l = __pcpu_freelist_pop(&htab->freelist); if (!l) return ERR_PTR(-E2BIG); l_new = container_of(l, struct htab_elem, fnode); bpf_map_inc_elem_count(&htab->map); } } else { if (is_map_full(htab)) if (!old_elem) /* when map is full and update() is replacing * old element, it's ok to allocate, since * old element will be freed immediately. * Otherwise return an error */ return ERR_PTR(-E2BIG); inc_elem_count(htab); l_new = bpf_mem_cache_alloc(&htab->ma); if (!l_new) { l_new = ERR_PTR(-ENOMEM); goto dec_count; } } memcpy(l_new->key, key, key_size); if (percpu) { if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { /* alloc_percpu zero-fills */ void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma); if (!ptr) { bpf_mem_cache_free(&htab->ma, l_new); l_new = ERR_PTR(-ENOMEM); goto dec_count; } l_new->ptr_to_pptr = ptr; pptr = *(void __percpu **)ptr; } pcpu_init_value(htab, pptr, value, onallcpus); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); } else if (fd_htab_map_needs_adjust(htab)) { size = round_up(size, 8); memcpy(l_new->key + round_up(key_size, 8), value, size); } else { copy_map_value(&htab->map, l_new->key + round_up(key_size, 8), value); } l_new->hash = hash; return l_new; dec_count: dec_elem_count(htab); return l_new; } static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, u64 map_flags) { if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) /* elem already exists */ return -EEXIST; if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) /* elem doesn't exist, cannot update it */ return -ENOENT; return 0; } /* Called from syscall or from eBPF program */ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; struct hlist_nulls_head *head; unsigned long flags; void *old_map_ptr; struct bucket *b; u32 key_size, hash; int ret; if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; if (unlikely(map_flags & BPF_F_LOCK)) { if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; /* find an element without taking the bucket lock */ l_old = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); ret = check_flags(htab, l_old, map_flags); if (ret) return ret; if (l_old) { /* grab the element lock and update value in place */ copy_map_value_locked(map, l_old->key + round_up(key_size, 8), value, false); return 0; } /* fall through, grab the bucket lock and lookup again. * 99.9% chance that the element won't be found, * but second lookup under lock has to be done. */ } ret = htab_lock_bucket(b, &flags); if (ret) return ret; l_old = lookup_elem_raw(head, hash, key, key_size); ret = check_flags(htab, l_old, map_flags); if (ret) goto err; if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { /* first lookup without the bucket lock didn't find the element, * but second lookup with the bucket lock found it. * This case is highly unlikely, but has to be dealt with: * grab the element lock in addition to the bucket lock * and update element in place */ copy_map_value_locked(map, l_old->key + round_up(key_size, 8), value, false); ret = 0; goto err; } l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, l_old); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); goto err; } /* add new element to the head of the list, so that * concurrent search will find it before old elem */ hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_nulls_del_rcu(&l_old->hash_node); /* l_old has already been stashed in htab->extra_elems, free * its special fields before it is available for reuse. Also * save the old map pointer in htab of maps before unlock * and release it after unlock. */ old_map_ptr = NULL; if (htab_is_prealloc(htab)) { if (map->ops->map_fd_put_ptr) old_map_ptr = fd_htab_map_get_ptr(map, l_old); check_and_free_fields(htab, l_old); } } htab_unlock_bucket(b, flags); if (l_old) { if (old_map_ptr) map->ops->map_fd_put_ptr(map, old_map_ptr, true); if (!htab_is_prealloc(htab)) free_htab_elem(htab, l_old); } return 0; err: htab_unlock_bucket(b, flags); return ret; } static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) { check_and_free_fields(htab, elem); bpf_map_dec_elem_count(&htab->map); bpf_lru_push_free(&htab->lru, &elem->lru_node); } static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old = NULL; struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; int ret; if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; /* For LRU, we need to alloc before taking bucket's * spinlock because getting free nodes from LRU may need * to remove older elements from htab and this removal * operation will need a bucket lock. */ l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM; copy_map_value(&htab->map, l_new->key + round_up(map->key_size, 8), value); ret = htab_lock_bucket(b, &flags); if (ret) goto err_lock_bucket; l_old = lookup_elem_raw(head, hash, key, key_size); ret = check_flags(htab, l_old, map_flags); if (ret) goto err; /* add new element to the head of the list, so that * concurrent search will find it before old elem */ hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { bpf_lru_node_set_ref(&l_new->lru_node); hlist_nulls_del_rcu(&l_old->hash_node); } ret = 0; err: htab_unlock_bucket(b, flags); err_lock_bucket: if (ret) htab_lru_push_free(htab, l_new); else if (l_old) htab_lru_push_free(htab, l_old); return ret; } static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags, bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; int ret; if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; ret = htab_lock_bucket(b, &flags); if (ret) return ret; l_old = lookup_elem_raw(head, hash, key, key_size); ret = check_flags(htab, l_old, map_flags); if (ret) goto err; if (l_old) { /* per-cpu hash map can update value in-place */ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, true, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; } hlist_nulls_add_head_rcu(&l_new->hash_node, head); } ret = 0; err: htab_unlock_bucket(b, flags); return ret; } static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags, bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; int ret; if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; /* For LRU, we need to alloc before taking bucket's * spinlock because LRU's elem alloc may need * to remove older elem from htab and this removal * operation will need a bucket lock. */ if (map_flags != BPF_EXIST) { l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM; } ret = htab_lock_bucket(b, &flags); if (ret) goto err_lock_bucket; l_old = lookup_elem_raw(head, hash, key, key_size); ret = check_flags(htab, l_old, map_flags); if (ret) goto err; if (l_old) { bpf_lru_node_set_ref(&l_old->lru_node); /* per-cpu hash map can update value in-place */ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), value, onallcpus); } else { pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), value, onallcpus); hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; } ret = 0; err: htab_unlock_bucket(b, flags); err_lock_bucket: if (l_new) { bpf_map_dec_elem_count(&htab->map); bpf_lru_push_free(&htab->lru, &l_new->lru_node); } return ret; } static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __htab_percpu_map_update_elem(map, key, value, map_flags, false); } static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, false); } /* Called from syscall or from eBPF program */ static long htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; struct bucket *b; struct htab_elem *l; unsigned long flags; u32 hash, key_size; int ret; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; ret = htab_lock_bucket(b, &flags); if (ret) return ret; l = lookup_elem_raw(head, hash, key, key_size); if (l) hlist_nulls_del_rcu(&l->hash_node); else ret = -ENOENT; htab_unlock_bucket(b, flags); if (l) free_htab_elem(htab, l); return ret; } static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; struct bucket *b; struct htab_elem *l; unsigned long flags; u32 hash, key_size; int ret; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; ret = htab_lock_bucket(b, &flags); if (ret) return ret; l = lookup_elem_raw(head, hash, key, key_size); if (l) hlist_nulls_del_rcu(&l->hash_node); else ret = -ENOENT; htab_unlock_bucket(b, flags); if (l) htab_lru_push_free(htab, l); return ret; } static void delete_all_elements(struct bpf_htab *htab) { int i; /* It's called from a worker thread and migration has been disabled, * therefore, it is OK to invoke bpf_mem_cache_free() directly. */ for (i = 0; i < htab->n_buckets; i++) { struct hlist_nulls_head *head = select_bucket(htab, i); struct hlist_nulls_node *n; struct htab_elem *l; hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { hlist_nulls_del_rcu(&l->hash_node); htab_elem_free(htab, l); } cond_resched(); } } static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) { int i; rcu_read_lock(); for (i = 0; i < htab->n_buckets; i++) { struct hlist_nulls_head *head = select_bucket(htab, i); struct hlist_nulls_node *n; struct htab_elem *l; hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } rcu_read_unlock(); } static void htab_map_free_timers_and_wq(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); /* We only free timer and workqueue on uref dropping to zero */ if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { if (!htab_is_prealloc(htab)) htab_free_malloced_timers_and_wq(htab); else htab_free_prealloced_timers_and_wq(htab); } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. * bpf_free_used_maps() is called after bpf prog is no longer executing. * There is no need to synchronize_rcu() here to protect map elements. */ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it * underneath and is responsible for waiting for callbacks to finish * during bpf_mem_alloc_destroy(). */ if (!htab_is_prealloc(htab)) { delete_all_elements(htab); } else { htab_free_prealloced_fields(htab); prealloc_destroy(htab); } bpf_map_free_elem_count(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); if (htab->use_percpu_counter) percpu_counter_destroy(&htab->pcount); bpf_map_area_free(htab); } static void htab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { void *value; rcu_read_lock(); value = htab_map_lookup_elem(map, key); if (!value) { rcu_read_unlock(); return; } btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); seq_puts(m, ": "); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_putc(m, '\n'); rcu_read_unlock(); } static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, bool is_lru_map, bool is_percpu, u64 flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; unsigned long bflags; struct htab_elem *l; u32 hash, key_size; struct bucket *b; int ret; key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; ret = htab_lock_bucket(b, &bflags); if (ret) return ret; l = lookup_elem_raw(head, hash, key, key_size); if (!l) { ret = -ENOENT; goto out_unlock; } if (is_percpu) { u32 roundup_value_size = round_up(map->value_size, 8); void __percpu *pptr; int off = 0, cpu; pptr = htab_elem_get_ptr(l, key_size); for_each_possible_cpu(cpu) { copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(&htab->map, value + off); off += roundup_value_size; } } else { u32 roundup_key_size = round_up(map->key_size, 8); if (flags & BPF_F_LOCK) copy_map_value_locked(map, value, l->key + roundup_key_size, true); else copy_map_value(map, value, l->key + roundup_key_size); /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, value); } hlist_nulls_del_rcu(&l->hash_node); out_unlock: htab_unlock_bucket(b, bflags); if (l) { if (is_lru_map) htab_lru_push_free(htab, l); else free_htab_elem(htab, l); } return ret; } static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return __htab_map_lookup_and_delete_elem(map, key, value, false, false, flags); } static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return __htab_map_lookup_and_delete_elem(map, key, value, false, true, flags); } static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return __htab_map_lookup_and_delete_elem(map, key, value, true, false, flags); } static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return __htab_map_lookup_and_delete_elem(map, key, value, true, true, flags); } static int __htab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr, bool do_delete, bool is_lru_map, bool is_percpu) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); u32 bucket_cnt, total, key_size, value_size, roundup_key_size; void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size, map_id; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; unsigned long flags = 0; bool locked = false; struct htab_elem *l; struct bucket *b; int ret = 0; elem_map_flags = attr->batch.elem_flags; if ((elem_map_flags & ~BPF_F_LOCK) || ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; map_flags = attr->batch.flags; if (map_flags) return -EINVAL; max_count = attr->batch.count; if (!max_count) return 0; if (put_user(0, &uattr->batch.count)) return -EFAULT; batch = 0; if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) return -EFAULT; if (batch >= htab->n_buckets) return -ENOENT; key_size = htab->map.key_size; roundup_key_size = round_up(htab->map.key_size, 8); value_size = htab->map.value_size; size = round_up(value_size, 8); if (is_percpu) value_size = size * num_possible_cpus(); total = 0; /* while experimenting with hash tables with sizes ranging from 10 to * 1000, it was observed that a bucket can have up to 5 entries. */ bucket_size = 5; alloc: /* We cannot do copy_from_user or copy_to_user inside * the rcu_read_lock. Allocate enough space here. */ keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN); values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN); if (!keys || !values) { ret = -ENOMEM; goto after_loop; } again: bpf_disable_instrumentation(); rcu_read_lock(); again_nocopy: dst_key = keys; dst_val = values; b = &htab->buckets[batch]; head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). */ if (locked) { ret = htab_lock_bucket(b, &flags); if (ret) { rcu_read_unlock(); bpf_enable_instrumentation(); goto after_loop; } } bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) bucket_cnt++; if (bucket_cnt && !locked) { locked = true; goto again_nocopy; } if (bucket_cnt > (max_count - total)) { if (total == 0) ret = -ENOSPC; /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ htab_unlock_bucket(b, flags); rcu_read_unlock(); bpf_enable_instrumentation(); goto after_loop; } if (bucket_cnt > bucket_size) { bucket_size = bucket_cnt; /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ htab_unlock_bucket(b, flags); rcu_read_unlock(); bpf_enable_instrumentation(); kvfree(keys); kvfree(values); goto alloc; } /* Next block is only safe to run if you have grabbed the lock */ if (!locked) goto next_batch; hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { memcpy(dst_key, l->key, key_size); if (is_percpu) { int off = 0, cpu; void __percpu *pptr; pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(&htab->map, dst_val + off); off += size; } } else { value = l->key + roundup_key_size; if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { struct bpf_map **inner_map = value; /* Actual value is the id of the inner map */ map_id = map->ops->map_fd_sys_lookup_elem(*inner_map); value = &map_id; } if (elem_map_flags & BPF_F_LOCK) copy_map_value_locked(map, dst_val, value, true); else copy_map_value(map, dst_val, value); /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, dst_val); } if (do_delete) { hlist_nulls_del_rcu(&l->hash_node); /* bpf_lru_push_free() will acquire lru_lock, which * may cause deadlock. See comments in function * prealloc_lru_pop(). Let us do bpf_lru_push_free() * after releasing the bucket lock. * * For htab of maps, htab_put_fd_value() in * free_htab_elem() may acquire a spinlock with bucket * lock being held and it violates the lock rule, so * invoke free_htab_elem() after unlock as well. */ l->batch_flink = node_to_free; node_to_free = l; } dst_key += key_size; dst_val += value_size; } htab_unlock_bucket(b, flags); locked = false; while (node_to_free) { l = node_to_free; node_to_free = node_to_free->batch_flink; if (is_lru_map) htab_lru_push_free(htab, l); else free_htab_elem(htab, l); } next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu. */ if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { batch++; goto again_nocopy; } rcu_read_unlock(); bpf_enable_instrumentation(); if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, key_size * bucket_cnt) || copy_to_user(uvalues + total * value_size, values, value_size * bucket_cnt))) { ret = -EFAULT; goto after_loop; } total += bucket_cnt; batch++; if (batch >= htab->n_buckets) { ret = -ENOENT; goto after_loop; } goto again; after_loop: if (ret == -EFAULT) goto out; /* copy # of entries and next batch */ ubatch = u64_to_user_ptr(attr->batch.out_batch); if (copy_to_user(ubatch, &batch, sizeof(batch)) || put_user(total, &uattr->batch.count)) ret = -EFAULT; out: kvfree(keys); kvfree(values); return ret; } static int htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, false, true); } static int htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, false, true); } static int htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, false, false); } static int htab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, false, false); } static int htab_lru_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, true, true); } static int htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, true, true); } static int htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, true, false); } static int htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, true, false); } struct bpf_iter_seq_hash_map_info { struct bpf_map *map; struct bpf_htab *htab; void *percpu_value_buf; // non-zero means percpu hash u32 bucket_id; u32 skip_elems; }; static struct htab_elem * bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, struct htab_elem *prev_elem) { const struct bpf_htab *htab = info->htab; u32 skip_elems = info->skip_elems; u32 bucket_id = info->bucket_id; struct hlist_nulls_head *head; struct hlist_nulls_node *n; struct htab_elem *elem; struct bucket *b; u32 i, count; if (bucket_id >= htab->n_buckets) return NULL; /* try to find next elem in the same bucket */ if (prev_elem) { /* no update/deletion on this bucket, prev_elem should be still valid * and we won't skip elements. */ n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); if (elem) return elem; /* not found, unlock and go to the next bucket */ b = &htab->buckets[bucket_id++]; rcu_read_unlock(); skip_elems = 0; } for (i = bucket_id; i < htab->n_buckets; i++) { b = &htab->buckets[i]; rcu_read_lock(); count = 0; head = &b->head; hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { if (count >= skip_elems) { info->bucket_id = i; info->skip_elems = count; return elem; } count++; } rcu_read_unlock(); skip_elems = 0; } info->bucket_id = i; info->skip_elems = 0; return NULL; } static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) { struct bpf_iter_seq_hash_map_info *info = seq->private; struct htab_elem *elem; elem = bpf_hash_map_seq_find_next(info, NULL); if (!elem) return NULL; if (*pos == 0) ++*pos; return elem; } static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_hash_map_info *info = seq->private; ++*pos; ++info->skip_elems; return bpf_hash_map_seq_find_next(info, v); } static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) { struct bpf_iter_seq_hash_map_info *info = seq->private; u32 roundup_key_size, roundup_value_size; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; struct bpf_iter_meta meta; int ret = 0, off = 0, cpu; struct bpf_prog *prog; void __percpu *pptr; meta.seq = seq; prog = bpf_iter_get_info(&meta, elem == NULL); if (prog) { ctx.meta = &meta; ctx.map = info->map; if (elem) { roundup_key_size = round_up(map->key_size, 8); ctx.key = elem->key; if (!info->percpu_value_buf) { ctx.value = elem->key + roundup_key_size; } else { roundup_value_size = round_up(map->value_size, 8); pptr = htab_elem_get_ptr(elem, map->key_size); for_each_possible_cpu(cpu) { copy_map_value_long(map, info->percpu_value_buf + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, info->percpu_value_buf + off); off += roundup_value_size; } ctx.value = info->percpu_value_buf; } } ret = bpf_iter_run_prog(prog, &ctx); } return ret; } static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) { return __bpf_hash_map_seq_show(seq, v); } static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) { if (!v) (void)__bpf_hash_map_seq_show(seq, NULL); else rcu_read_unlock(); } static int bpf_iter_init_hash_map(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_iter_seq_hash_map_info *seq_info = priv_data; struct bpf_map *map = aux->map; void *value_buf; u32 buf_size; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { buf_size = round_up(map->value_size, 8) * num_possible_cpus(); value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); if (!value_buf) return -ENOMEM; seq_info->percpu_value_buf = value_buf; } bpf_map_inc_with_uref(map); seq_info->map = map; seq_info->htab = container_of(map, struct bpf_htab, map); return 0; } static void bpf_iter_fini_hash_map(void *priv_data) { struct bpf_iter_seq_hash_map_info *seq_info = priv_data; bpf_map_put_with_uref(seq_info->map); kfree(seq_info->percpu_value_buf); } static const struct seq_operations bpf_hash_map_seq_ops = { .start = bpf_hash_map_seq_start, .next = bpf_hash_map_seq_next, .stop = bpf_hash_map_seq_stop, .show = bpf_hash_map_seq_show, }; static const struct bpf_iter_seq_info iter_seq_info = { .seq_ops = &bpf_hash_map_seq_ops, .init_seq_private = bpf_iter_init_hash_map, .fini_seq_private = bpf_iter_fini_hash_map, .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), }; static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; struct hlist_nulls_node *n; struct htab_elem *elem; u32 roundup_key_size; int i, num_elems = 0; void __percpu *pptr; struct bucket *b; void *key, *val; bool is_percpu; u64 ret = 0; cant_migrate(); if (flags != 0) return -EINVAL; is_percpu = htab_is_percpu(htab); roundup_key_size = round_up(map->key_size, 8); /* migration has been disabled, so percpu value prepared here will be * the same as the one seen by the bpf program with * bpf_map_lookup_elem(). */ for (i = 0; i < htab->n_buckets; i++) { b = &htab->buckets[i]; rcu_read_lock(); head = &b->head; hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) { key = elem->key; if (is_percpu) { /* current cpu value for percpu map */ pptr = htab_elem_get_ptr(elem, map->key_size); val = this_cpu_ptr(pptr); } else { val = elem->key + roundup_key_size; } num_elems++; ret = callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)val, (u64)(long)callback_ctx, 0); /* return value: 0 - continue, 1 - stop and return */ if (ret) { rcu_read_unlock(); goto out; } } rcu_read_unlock(); } out: return num_elems; } static u64 htab_map_mem_usage(const struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); u32 value_size = round_up(htab->map.value_size, 8); bool prealloc = htab_is_prealloc(htab); bool percpu = htab_is_percpu(htab); bool lru = htab_is_lru(htab); u64 num_entries; u64 usage = sizeof(struct bpf_htab); usage += sizeof(struct bucket) * htab->n_buckets; usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT; if (prealloc) { num_entries = map->max_entries; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); usage += htab->elem_size * num_entries; if (percpu) usage += value_size * num_possible_cpus() * num_entries; else if (!lru) usage += sizeof(struct htab_elem *) * num_possible_cpus(); } else { #define LLIST_NODE_SZ sizeof(struct llist_node) num_entries = htab->use_percpu_counter ? percpu_counter_sum(&htab->pcount) : atomic_read(&htab->count); usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries; if (percpu) { usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries; usage += value_size * num_possible_cpus() * num_entries; } } return usage; } BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) const struct bpf_map_ops htab_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; const struct bpf_map_ops htab_lru_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru), .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; /* Called from eBPF program */ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); else return NULL; } /* inline bpf_map_lookup_elem() call for per-CPU hashmap */ static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; if (!bpf_jit_supports_percpu_insn()) return -EOPNOTSUPP; BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, (void *(*)(struct bpf_map *map, void *key))NULL)); *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, offsetof(struct htab_elem, key) + roundup(map->key_size, 8)); *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); return insn - insn_buf; } static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct htab_elem *l; if (cpu >= nr_cpu_ids) return NULL; l = __htab_map_lookup_elem(map, key); if (l) return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); else return NULL; } static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) { bpf_lru_node_set_ref(&l->lru_node); return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); } return NULL; } static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct htab_elem *l; if (cpu >= nr_cpu_ids) return NULL; l = __htab_map_lookup_elem(map, key); if (l) { bpf_lru_node_set_ref(&l->lru_node); return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); } return NULL; } int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; int cpu, off = 0; u32 size; /* per_cpu areas are zero-filled and bpf programs can only * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ size = round_up(map->value_size, 8); rcu_read_lock(); l = __htab_map_lookup_elem(map, key); if (!l) goto out; /* We do not mark LRU map element here in order to not mess up * eviction heuristics when user space does a map walk. */ pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } ret = 0; out: rcu_read_unlock(); return ret; } int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); int ret; rcu_read_lock(); if (htab_is_lru(htab)) ret = __htab_lru_percpu_map_update_elem(map, key, value, map_flags, true); else ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); rcu_read_unlock(); return ret; } static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { struct htab_elem *l; void __percpu *pptr; int cpu; rcu_read_lock(); l = __htab_map_lookup_elem(map, key); if (!l) { rcu_read_unlock(); return; } btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); seq_puts(m, ": {\n"); pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { seq_printf(m, "\tcpu%d: ", cpu); btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(pptr, cpu), m); seq_putc(m, '\n'); } seq_puts(m, "}\n"); rcu_read_unlock(); } const struct bpf_map_ops htab_percpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_percpu_map_lookup_elem, .map_gen_lookup = htab_percpu_map_gen_lookup, .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_percpu), .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_lru_percpu_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru_percpu), .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32)) return -EINVAL; return htab_map_alloc_check(attr); } static void fd_htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_node *n; struct hlist_nulls_head *head; struct htab_elem *l; int i; for (i = 0; i < htab->n_buckets; i++) { head = select_bucket(htab, i); hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { void *ptr = fd_htab_map_get_ptr(map, l); map->ops->map_fd_put_ptr(map, ptr, false); } } htab_map_free(map); } /* only called from syscall */ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) { void **ptr; int ret = 0; if (!map->ops->map_fd_sys_lookup_elem) return -ENOTSUPP; rcu_read_lock(); ptr = htab_map_lookup_elem(map, key); if (ptr) *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); else ret = -ENOENT; rcu_read_unlock(); return ret; } /* only called from syscall */ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { void *ptr; int ret; u32 ufd = *(u32 *)value; ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); if (IS_ERR(ptr)) return PTR_ERR(ptr); /* The htab bucket lock is always held during update operations in fd * htab map, and the following rcu_read_lock() is only used to avoid * the WARN_ON_ONCE in htab_map_update_elem(). */ rcu_read_lock(); ret = htab_map_update_elem(map, key, &ptr, map_flags); rcu_read_unlock(); if (ret) map->ops->map_fd_put_ptr(map, ptr, false); return ret; } static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) { struct bpf_map *map, *inner_map_meta; inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); if (IS_ERR(inner_map_meta)) return inner_map_meta; map = htab_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; } map->inner_map_meta = inner_map_meta; return map; } static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_map **inner_map = htab_map_lookup_elem(map, key); if (!inner_map) return NULL; return READ_ONCE(*inner_map); } static int htab_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, (void *(*)(struct bpf_map *map, void *key))NULL)); *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, offsetof(struct htab_elem, key) + round_up(map->key_size, 8)); *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); return insn - insn_buf; } static void htab_of_map_free(struct bpf_map *map) { bpf_map_meta_free(map->inner_map_meta); fd_htab_map_free(map); } const struct bpf_map_ops htab_of_maps_map_ops = { .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, .map_free = htab_of_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_of_map_lookup_elem, .map_delete_elem = htab_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, .map_check_btf = map_check_no_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], };
24 11 10 1 1 2 1 3 3 2 2 30 1 1 13 14 1 26 1 2 1 1 10 2 12 4 12 8 1 8 3 1 19 18 1 6 2 2 5 5 1 4 2 9 1 1 1 2 2 2 2 2 2 1 3 6 4 2 114 114 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_pedit.c Generic packet editor * * Authors: Jamal Hadi Salim (2002-4) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/slab.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_pedit.h> #include <net/tc_act/tc_pedit.h> #include <uapi/linux/tc_act/tc_pedit.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, [TCA_PEDIT_PARMS_EX] = { .len = sizeof(struct tc_pedit) }, [TCA_PEDIT_KEYS_EX] = { .type = NLA_NESTED }, }; static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = { [TCA_PEDIT_KEY_EX_HTYPE] = NLA_POLICY_MAX(NLA_U16, TCA_PEDIT_HDR_TYPE_MAX), [TCA_PEDIT_KEY_EX_CMD] = NLA_POLICY_MAX(NLA_U16, TCA_PEDIT_CMD_MAX), }; static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, u8 n, struct netlink_ext_ack *extack) { struct tcf_pedit_key_ex *keys_ex; struct tcf_pedit_key_ex *k; const struct nlattr *ka; int err = -EINVAL; int rem; if (!nla) return NULL; keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL); if (!keys_ex) return ERR_PTR(-ENOMEM); k = keys_ex; nla_for_each_nested(ka, nla, rem) { struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1]; if (!n) { NL_SET_ERR_MSG_MOD(extack, "Can't parse more extended keys than requested"); err = -EINVAL; goto err_out; } n--; if (nla_type(ka) != TCA_PEDIT_KEY_EX) { NL_SET_ERR_MSG_ATTR(extack, ka, "Unknown attribute, expected extended key"); err = -EINVAL; goto err_out; } err = nla_parse_nested_deprecated(tb, TCA_PEDIT_KEY_EX_MAX, ka, pedit_key_ex_policy, NULL); if (err) goto err_out; if (NL_REQ_ATTR_CHECK(extack, nla, tb, TCA_PEDIT_KEY_EX_HTYPE)) { NL_SET_ERR_MSG(extack, "Missing required attribute"); err = -EINVAL; goto err_out; } if (NL_REQ_ATTR_CHECK(extack, nla, tb, TCA_PEDIT_KEY_EX_CMD)) { NL_SET_ERR_MSG(extack, "Missing required attribute"); err = -EINVAL; goto err_out; } k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]); k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]); k++; } if (n) { NL_SET_ERR_MSG_MOD(extack, "Not enough extended keys to parse"); err = -EINVAL; goto err_out; } return keys_ex; err_out: kfree(keys_ex); return ERR_PTR(err); } static int tcf_pedit_key_ex_dump(struct sk_buff *skb, struct tcf_pedit_key_ex *keys_ex, int n) { struct nlattr *keys_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEYS_EX); if (!keys_start) goto nla_failure; for (; n > 0; n--) { struct nlattr *key_start; key_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEY_EX); if (!key_start) goto nla_failure; if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) || nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) goto nla_failure; nla_nest_end(skb, key_start); keys_ex++; } nla_nest_end(skb, keys_start); return 0; nla_failure: nla_nest_cancel(skb, keys_start); return -EINVAL; } static void tcf_pedit_cleanup_rcu(struct rcu_head *head) { struct tcf_pedit_parms *parms = container_of(head, struct tcf_pedit_parms, rcu); kfree(parms->tcfp_keys_ex); kfree(parms->tcfp_keys); kfree(parms); } static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; struct tcf_pedit_parms *oparms, *nparms; struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; struct nlattr *pattr; struct tcf_pedit *p; int ret = 0, err; int i, ksize; u32 index; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); if (err < 0) return err; pattr = tb[TCA_PEDIT_PARMS]; if (!pattr) pattr = tb[TCA_PEDIT_PARMS_EX]; if (!pattr) { NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute"); return -EINVAL; } parm = nla_data(pattr); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_pedit_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { ret = -EEXIST; goto out_release; } } else { return err; } if (!parm->nkeys) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); ret = -EINVAL; goto out_release; } ksize = parm->nkeys * sizeof(struct tc_pedit_key); if (nla_len(pattr) < sizeof(*parm) + ksize) { NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); ret = -EINVAL; goto out_release; } nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); if (!nparms) { ret = -ENOMEM; goto out_release; } nparms->tcfp_keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys, extack); if (IS_ERR(nparms->tcfp_keys_ex)) { ret = PTR_ERR(nparms->tcfp_keys_ex); goto out_free; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) { ret = err; goto out_free_ex; } nparms->tcfp_off_max_hint = 0; nparms->tcfp_flags = parm->flags; nparms->tcfp_nkeys = parm->nkeys; nparms->tcfp_keys = kmemdup(parm->keys, ksize, GFP_KERNEL); if (!nparms->tcfp_keys) { ret = -ENOMEM; goto put_chain; } for (i = 0; i < nparms->tcfp_nkeys; ++i) { u32 offmask = nparms->tcfp_keys[i].offmask; u32 cur = nparms->tcfp_keys[i].off; /* The AT option can be added to static offsets in the datapath */ if (!offmask && cur % 4) { NL_SET_ERR_MSG_MOD(extack, "Offsets must be on 32bit boundaries"); ret = -EINVAL; goto out_free_keys; } /* sanitize the shift value for any later use */ nparms->tcfp_keys[i].shift = min_t(size_t, BITS_PER_TYPE(int) - 1, nparms->tcfp_keys[i].shift); /* The AT option can read a single byte, we can bound the actual * value with uchar max. */ cur += (0xff & offmask) >> nparms->tcfp_keys[i].shift; /* Each key touches 4 bytes starting from the computed offset */ nparms->tcfp_off_max_hint = max(nparms->tcfp_off_max_hint, cur + 4); } p = to_pedit(*a); spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(p->parms, nparms, 1); spin_unlock_bh(&p->tcf_lock); if (oparms) call_rcu(&oparms->rcu, tcf_pedit_cleanup_rcu); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; out_free_keys: kfree(nparms->tcfp_keys); put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); out_free_ex: kfree(nparms->tcfp_keys_ex); out_free: kfree(nparms); out_release: tcf_idr_release(*a, bind); return ret; } static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_parms *parms; parms = rcu_dereference_protected(p->parms, 1); if (parms) call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu); } static bool offset_valid(struct sk_buff *skb, int offset) { if (offset > 0 && offset > skb->len) return false; if (offset < 0 && -offset > skb_headroom(skb)) return false; return true; } static int pedit_l4_skb_offset(struct sk_buff *skb, int *hoffset, const int header_type) { const int noff = skb_network_offset(skb); int ret = -EINVAL; struct iphdr _iph; switch (skb->protocol) { case htons(ETH_P_IP): { const struct iphdr *iph = skb_header_pointer(skb, noff, sizeof(_iph), &_iph); if (!iph) goto out; *hoffset = noff + iph->ihl * 4; ret = 0; break; } case htons(ETH_P_IPV6): ret = ipv6_find_hdr(skb, hoffset, header_type, NULL, NULL) == header_type ? 0 : -EINVAL; break; } out: return ret; } static int pedit_skb_hdr_offset(struct sk_buff *skb, enum pedit_header_type htype, int *hoffset) { int ret = -EINVAL; /* 'htype' is validated in the netlink parsing */ switch (htype) { case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH: if (skb_mac_header_was_set(skb)) { *hoffset = skb_mac_offset(skb); ret = 0; } break; case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK: case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4: case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6: *hoffset = skb_network_offset(skb); ret = 0; break; case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP: ret = pedit_l4_skb_offset(skb, hoffset, IPPROTO_TCP); break; case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP: ret = pedit_l4_skb_offset(skb, hoffset, IPPROTO_UDP); break; default: break; } return ret; } TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_key_ex *tkey_ex; struct tcf_pedit_parms *parms; struct tc_pedit_key *tkey; u32 max_offset; int i; parms = rcu_dereference_bh(p->parms); max_offset = (skb_transport_header_was_set(skb) ? skb_transport_offset(skb) : skb_network_offset(skb)) + parms->tcfp_off_max_hint; if (skb_ensure_writable(skb, min(skb->len, max_offset))) goto done; tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); tkey = parms->tcfp_keys; tkey_ex = parms->tcfp_keys_ex; for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) { int offset = tkey->off; int hoffset = 0; u32 *ptr, hdata; u32 val; int rc; if (tkey_ex) { htype = tkey_ex->htype; cmd = tkey_ex->cmd; tkey_ex++; } rc = pedit_skb_hdr_offset(skb, htype, &hoffset); if (rc) { pr_info_ratelimited("tc action pedit unable to extract header offset for header type (0x%x)\n", htype); goto bad; } if (tkey->offmask) { u8 *d, _d; if (!offset_valid(skb, hoffset + tkey->at)) { pr_info_ratelimited("tc action pedit 'at' offset %d out of bounds\n", hoffset + tkey->at); goto bad; } d = skb_header_pointer(skb, hoffset + tkey->at, sizeof(_d), &_d); if (!d) goto bad; offset += (*d & tkey->offmask) >> tkey->shift; if (offset % 4) { pr_info_ratelimited("tc action pedit offset must be on 32 bit boundaries\n"); goto bad; } } if (!offset_valid(skb, hoffset + offset)) { pr_info_ratelimited("tc action pedit offset %d out of bounds\n", hoffset + offset); goto bad; } ptr = skb_header_pointer(skb, hoffset + offset, sizeof(hdata), &hdata); if (!ptr) goto bad; /* just do it, baby */ switch (cmd) { case TCA_PEDIT_KEY_EX_CMD_SET: val = tkey->val; break; case TCA_PEDIT_KEY_EX_CMD_ADD: val = (*ptr + tkey->val) & ~tkey->mask; break; default: pr_info_ratelimited("tc action pedit bad command (%d)\n", cmd); goto bad; } *ptr = ((*ptr & tkey->mask) ^ val); if (ptr == &hdata) skb_store_bits(skb, hoffset + offset, ptr, 4); } goto done; bad: tcf_action_inc_overlimit_qstats(&p->common); done: return p->tcf_action; } static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_pedit *d = to_pedit(a); struct tcf_t *tm = &d->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_parms *parms; struct tc_pedit *opt; struct tcf_t t; int s; spin_lock_bh(&p->tcf_lock); parms = rcu_dereference_protected(p->parms, 1); s = struct_size(opt, keys, parms->tcfp_nkeys); opt = kzalloc(s, GFP_ATOMIC); if (unlikely(!opt)) { spin_unlock_bh(&p->tcf_lock); return -ENOBUFS; } opt->nkeys = parms->tcfp_nkeys; memcpy(opt->keys, parms->tcfp_keys, flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; opt->flags = parms->tcfp_flags; opt->action = p->tcf_action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; if (parms->tcfp_keys_ex) { if (tcf_pedit_key_ex_dump(skb, parms->tcfp_keys_ex, parms->tcfp_nkeys)) goto nla_put_failure; if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt)) goto nla_put_failure; } else { if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) goto nla_put_failure; } tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; spin_unlock_bh(&p->tcf_lock); kfree(opt); return skb->len; nla_put_failure: spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); kfree(opt); return -1; } static int tcf_pedit_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; int k; for (k = 0; k < tcf_pedit_nkeys(act); k++) { switch (tcf_pedit_cmd(act, k)) { case TCA_PEDIT_KEY_EX_CMD_SET: entry->id = FLOW_ACTION_MANGLE; break; case TCA_PEDIT_KEY_EX_CMD_ADD: entry->id = FLOW_ACTION_ADD; break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); return -EOPNOTSUPP; } entry->mangle.htype = tcf_pedit_htype(act, k); entry->mangle.mask = tcf_pedit_mask(act, k); entry->mangle.val = tcf_pedit_val(act, k); entry->mangle.offset = tcf_pedit_offset(act, k); entry->hw_stats = tc_act_hw_stats(act->hw_stats); entry++; } *index_inc = k; } else { struct flow_offload_action *fl_action = entry_data; u32 cmd = tcf_pedit_cmd(act, 0); int k; switch (cmd) { case TCA_PEDIT_KEY_EX_CMD_SET: fl_action->id = FLOW_ACTION_MANGLE; break; case TCA_PEDIT_KEY_EX_CMD_ADD: fl_action->id = FLOW_ACTION_ADD; break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); return -EOPNOTSUPP; } for (k = 1; k < tcf_pedit_nkeys(act); k++) { if (cmd != tcf_pedit_cmd(act, k)) { NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); return -EOPNOTSUPP; } } } return 0; } static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .id = TCA_ID_PEDIT, .owner = THIS_MODULE, .act = tcf_pedit_act, .stats_update = tcf_pedit_stats_update, .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, .offload_act_setup = tcf_pedit_offload_act_setup, .size = sizeof(struct tcf_pedit), }; MODULE_ALIAS_NET_ACT("pedit"); static __net_init int pedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id); return tc_action_net_init(net, tn, &act_pedit_ops); } static void __net_exit pedit_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_pedit_ops.net_id); } static struct pernet_operations pedit_net_ops = { .init = pedit_init_net, .exit_batch = pedit_exit_net, .id = &act_pedit_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); MODULE_DESCRIPTION("Generic Packet Editor actions"); MODULE_LICENSE("GPL"); static int __init pedit_init_module(void) { return tcf_register_action(&act_pedit_ops, &pedit_net_ops); } static void __exit pedit_cleanup_module(void) { tcf_unregister_action(&act_pedit_ops, &pedit_net_ops); } module_init(pedit_init_module); module_exit(pedit_cleanup_module);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright(c) 2020 Intel Corporation. */ #ifndef XSK_BUFF_POOL_H_ #define XSK_BUFF_POOL_H_ #include <linux/if_xdp.h> #include <linux/types.h> #include <linux/dma-mapping.h> #include <linux/bpf.h> #include <net/xdp.h> struct xsk_buff_pool; struct xdp_rxq_info; struct xsk_cb_desc; struct xsk_queue; struct xdp_desc; struct xdp_umem; struct xdp_sock; struct device; struct page; #define XSK_PRIV_MAX 24 struct xdp_buff_xsk { struct xdp_buff xdp; u8 cb[XSK_PRIV_MAX]; dma_addr_t dma; dma_addr_t frame_dma; struct xsk_buff_pool *pool; struct list_head list_node; } __aligned_largest; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) #define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t)) struct xsk_dma_map { dma_addr_t *dma_pages; struct device *dev; struct net_device *netdev; refcount_t users; struct list_head list; /* Protected by the RTNL_LOCK */ u32 dma_pages_cnt; }; struct xsk_buff_pool { /* Members only used in the control path first. */ struct device *dev; struct net_device *netdev; struct list_head xsk_tx_list; /* Protects modifications to the xsk_tx_list */ spinlock_t xsk_tx_list_lock; refcount_t users; struct xdp_umem *umem; struct work_struct work; /* Protects generic receive in shared and non-shared umem mode. */ spinlock_t rx_lock; struct list_head free_list; struct list_head xskb_list; u32 heads_cnt; u16 queue_id; /* Data path members as close to free_heads at the end as possible. */ struct xsk_queue *fq ____cacheline_aligned_in_smp; struct xsk_queue *cq; /* For performance reasons, each buff pool has its own array of dma_pages * even when they are identical. */ dma_addr_t *dma_pages; struct xdp_buff_xsk *heads; struct xdp_desc *tx_descs; u64 chunk_mask; u64 addrs_cnt; u32 free_list_cnt; u32 dma_pages_cnt; u32 free_heads_cnt; u32 headroom; u32 chunk_size; u32 chunk_shift; u32 frame_len; u32 xdp_zc_max_segs; u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; bool unaligned; bool tx_sw_csum; void *addrs; /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect: * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when * sockets share a single cq when the same netdev and queue id is shared. */ spinlock_t cq_lock; struct xdp_buff_xsk *free_heads[]; }; /* Masks for xdp_umem_page flags. * The low 12-bits of the addr will be 0 since this is the page address, so we * can use them for flags. */ #define XSK_NEXT_PG_CONTIG_SHIFT 0 #define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) /* AF_XDP core. */ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem); int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, u16 queue_id, u16 flags); int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, struct net_device *dev, u16 queue_id); int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs); void xp_destroy(struct xsk_buff_pool *pool); void xp_get_pool(struct xsk_buff_pool *pool); bool xp_put_pool(struct xsk_buff_pool *pool); void xp_clear_dev(struct xsk_buff_pool *pool); void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); /* AF_XDP, and XDP core. */ void xp_free(struct xdp_buff_xsk *xskb); static inline void xp_init_xskb_addr(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, u64 addr) { xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; } static inline void xp_init_xskb_dma(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, dma_addr_t *dma_pages, u64 addr) { xskb->frame_dma = (dma_pages[addr >> PAGE_SHIFT] & ~XSK_NEXT_PG_CONTIG_MASK) + (addr & ~PAGE_MASK); xskb->dma = xskb->frame_dma + pool->headroom + XDP_PACKET_HEADROOM; } /* AF_XDP ZC drivers, via xdp_sock_buff.h */ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); void xp_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc); int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs, struct page **pages, u32 nr_pages); void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool); u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max); bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); struct xdp_desc_ctx { dma_addr_t dma; struct xsk_tx_metadata *meta; }; struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr); static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) { return xskb->dma; } static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) { return xskb->frame_dma; } static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) { dma_sync_single_for_cpu(xskb->pool->dev, xskb->dma, xskb->pool->frame_len, DMA_BIDIRECTIONAL); } static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { dma_sync_single_for_device(pool->dev, dma, size, DMA_BIDIRECTIONAL); } /* Masks for xdp_umem_page flags. * The low 12-bits of the addr will be 0 since this is the page address, so we * can use them for flags. */ #define XSK_NEXT_PG_CONTIG_SHIFT 0 #define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, u64 addr, u32 len) { bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; if (likely(!cross_pg)) return false; return pool->dma_pages && !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); } static inline bool xp_mb_desc(const struct xdp_desc *desc) { return desc->options & XDP_PKT_CONTD; } static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) { return addr & pool->chunk_mask; } static inline u64 xp_unaligned_extract_addr(u64 addr) { return addr & XSK_UNALIGNED_BUF_ADDR_MASK; } static inline u64 xp_unaligned_extract_offset(u64 addr) { return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; } static inline u64 xp_unaligned_add_offset_to_addr(u64 addr) { return xp_unaligned_extract_addr(addr) + xp_unaligned_extract_offset(addr); } static inline u32 xp_aligned_extract_idx(struct xsk_buff_pool *pool, u64 addr) { return xp_aligned_extract_addr(pool, addr) >> pool->chunk_shift; } static inline void xp_release(struct xdp_buff_xsk *xskb) { if (xskb->pool->unaligned) xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; } static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool) { u64 orig_addr = xskb->xdp.data - pool->addrs; u64 offset; if (!pool->unaligned) return orig_addr; offset = xskb->xdp.data - xskb->xdp.data_hard_start; offset += pool->headroom; orig_addr -= offset; return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) { return pool->tx_metadata_len > 0; } #endif /* XSK_BUFF_POOL_H_ */
12 25 18 8 29 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "cookie.h" #include "peer.h" #include "device.h" #include "messages.h" #include "ratelimiter.h" #include "timers.h" #include <crypto/blake2s.h> #include <crypto/chacha20poly1305.h> #include <crypto/utils.h> #include <net/ipv6.h> void wg_cookie_checker_init(struct cookie_checker *checker, struct wg_device *wg) { init_rwsem(&checker->secret_lock); checker->secret_birthdate = ktime_get_coarse_boottime_ns(); get_random_bytes(checker->secret, NOISE_HASH_LEN); checker->device = wg; } enum { COOKIE_KEY_LABEL_LEN = 8 }; static const u8 mac1_key_label[COOKIE_KEY_LABEL_LEN] = "mac1----"; static const u8 cookie_key_label[COOKIE_KEY_LABEL_LEN] = "cookie--"; static void precompute_key(u8 key[NOISE_SYMMETRIC_KEY_LEN], const u8 pubkey[NOISE_PUBLIC_KEY_LEN], const u8 label[COOKIE_KEY_LABEL_LEN]) { struct blake2s_state blake; blake2s_init(&blake, NOISE_SYMMETRIC_KEY_LEN); blake2s_update(&blake, label, COOKIE_KEY_LABEL_LEN); blake2s_update(&blake, pubkey, NOISE_PUBLIC_KEY_LEN); blake2s_final(&blake, key); } /* Must hold peer->handshake.static_identity->lock */ void wg_cookie_checker_precompute_device_keys(struct cookie_checker *checker) { if (likely(checker->device->static_identity.has_identity)) { precompute_key(checker->cookie_encryption_key, checker->device->static_identity.static_public, cookie_key_label); precompute_key(checker->message_mac1_key, checker->device->static_identity.static_public, mac1_key_label); } else { memset(checker->cookie_encryption_key, 0, NOISE_SYMMETRIC_KEY_LEN); memset(checker->message_mac1_key, 0, NOISE_SYMMETRIC_KEY_LEN); } } void wg_cookie_checker_precompute_peer_keys(struct wg_peer *peer) { precompute_key(peer->latest_cookie.cookie_decryption_key, peer->handshake.remote_static, cookie_key_label); precompute_key(peer->latest_cookie.message_mac1_key, peer->handshake.remote_static, mac1_key_label); } void wg_cookie_init(struct cookie *cookie) { memset(cookie, 0, sizeof(*cookie)); init_rwsem(&cookie->lock); } static void compute_mac1(u8 mac1[COOKIE_LEN], const void *message, size_t len, const u8 key[NOISE_SYMMETRIC_KEY_LEN]) { len = len - sizeof(struct message_macs) + offsetof(struct message_macs, mac1); blake2s(mac1, message, key, COOKIE_LEN, len, NOISE_SYMMETRIC_KEY_LEN); } static void compute_mac2(u8 mac2[COOKIE_LEN], const void *message, size_t len, const u8 cookie[COOKIE_LEN]) { len = len - sizeof(struct message_macs) + offsetof(struct message_macs, mac2); blake2s(mac2, message, cookie, COOKIE_LEN, len, COOKIE_LEN); } static void make_cookie(u8 cookie[COOKIE_LEN], struct sk_buff *skb, struct cookie_checker *checker) { struct blake2s_state state; if (wg_birthdate_has_expired(checker->secret_birthdate, COOKIE_SECRET_MAX_AGE)) { down_write(&checker->secret_lock); checker->secret_birthdate = ktime_get_coarse_boottime_ns(); get_random_bytes(checker->secret, NOISE_HASH_LEN); up_write(&checker->secret_lock); } down_read(&checker->secret_lock); blake2s_init_key(&state, COOKIE_LEN, checker->secret, NOISE_HASH_LEN); if (skb->protocol == htons(ETH_P_IP)) blake2s_update(&state, (u8 *)&ip_hdr(skb)->saddr, sizeof(struct in_addr)); else if (skb->protocol == htons(ETH_P_IPV6)) blake2s_update(&state, (u8 *)&ipv6_hdr(skb)->saddr, sizeof(struct in6_addr)); blake2s_update(&state, (u8 *)&udp_hdr(skb)->source, sizeof(__be16)); blake2s_final(&state, cookie); up_read(&checker->secret_lock); } enum cookie_mac_state wg_cookie_validate_packet(struct cookie_checker *checker, struct sk_buff *skb, bool check_cookie) { struct message_macs *macs = (struct message_macs *) (skb->data + skb->len - sizeof(*macs)); enum cookie_mac_state ret; u8 computed_mac[COOKIE_LEN]; u8 cookie[COOKIE_LEN]; ret = INVALID_MAC; compute_mac1(computed_mac, skb->data, skb->len, checker->message_mac1_key); if (crypto_memneq(computed_mac, macs->mac1, COOKIE_LEN)) goto out; ret = VALID_MAC_BUT_NO_COOKIE; if (!check_cookie) goto out; make_cookie(cookie, skb, checker); compute_mac2(computed_mac, skb->data, skb->len, cookie); if (crypto_memneq(computed_mac, macs->mac2, COOKIE_LEN)) goto out; ret = VALID_MAC_WITH_COOKIE_BUT_RATELIMITED; if (!wg_ratelimiter_allow(skb, dev_net(checker->device->dev))) goto out; ret = VALID_MAC_WITH_COOKIE; out: return ret; } void wg_cookie_add_mac_to_packet(void *message, size_t len, struct wg_peer *peer) { struct message_macs *macs = (struct message_macs *) ((u8 *)message + len - sizeof(*macs)); down_write(&peer->latest_cookie.lock); compute_mac1(macs->mac1, message, len, peer->latest_cookie.message_mac1_key); memcpy(peer->latest_cookie.last_mac1_sent, macs->mac1, COOKIE_LEN); peer->latest_cookie.have_sent_mac1 = true; up_write(&peer->latest_cookie.lock); down_read(&peer->latest_cookie.lock); if (peer->latest_cookie.is_valid && !wg_birthdate_has_expired(peer->latest_cookie.birthdate, COOKIE_SECRET_MAX_AGE - COOKIE_SECRET_LATENCY)) compute_mac2(macs->mac2, message, len, peer->latest_cookie.cookie); else memset(macs->mac2, 0, COOKIE_LEN); up_read(&peer->latest_cookie.lock); } void wg_cookie_message_create(struct message_handshake_cookie *dst, struct sk_buff *skb, __le32 index, struct cookie_checker *checker) { struct message_macs *macs = (struct message_macs *) ((u8 *)skb->data + skb->len - sizeof(*macs)); u8 cookie[COOKIE_LEN]; dst->header.type = cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE); dst->receiver_index = index; get_random_bytes_wait(dst->nonce, COOKIE_NONCE_LEN); make_cookie(cookie, skb, checker); xchacha20poly1305_encrypt(dst->encrypted_cookie, cookie, COOKIE_LEN, macs->mac1, COOKIE_LEN, dst->nonce, checker->cookie_encryption_key); } void wg_cookie_message_consume(struct message_handshake_cookie *src, struct wg_device *wg) { struct wg_peer *peer = NULL; u8 cookie[COOKIE_LEN]; bool ret; if (unlikely(!wg_index_hashtable_lookup(wg->index_hashtable, INDEX_HASHTABLE_HANDSHAKE | INDEX_HASHTABLE_KEYPAIR, src->receiver_index, &peer))) return; down_read(&peer->latest_cookie.lock); if (unlikely(!peer->latest_cookie.have_sent_mac1)) { up_read(&peer->latest_cookie.lock); goto out; } ret = xchacha20poly1305_decrypt( cookie, src->encrypted_cookie, sizeof(src->encrypted_cookie), peer->latest_cookie.last_mac1_sent, COOKIE_LEN, src->nonce, peer->latest_cookie.cookie_decryption_key); up_read(&peer->latest_cookie.lock); if (ret) { down_write(&peer->latest_cookie.lock); memcpy(peer->latest_cookie.cookie, cookie, COOKIE_LEN); peer->latest_cookie.birthdate = ktime_get_coarse_boottime_ns(); peer->latest_cookie.is_valid = true; peer->latest_cookie.have_sent_mac1 = false; up_write(&peer->latest_cookie.lock); } else { net_dbg_ratelimited("%s: Could not decrypt invalid cookie response\n", wg->dev->name); } out: wg_peer_put(peer); }
16 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 // SPDX-License-Identifier: GPL-2.0-or-later /* * X.25 Packet Layer release 002 * * This is ALPHA test software. This code may break your machine, * randomly fail to work with new releases, misbehave and/or generally * screw up. It might even work. * * This code REQUIRES 2.1.15 or higher * * History * X.25 001 Jonathan Naylor Started coding. * X.25 002 Jonathan Naylor New timer architecture. * Centralised disconnection processing. */ #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/x25.h> static void x25_heartbeat_expiry(struct timer_list *t); static void x25_timer_expiry(struct timer_list *t); void x25_init_timers(struct sock *sk) { struct x25_sock *x25 = x25_sk(sk); timer_setup(&x25->timer, x25_timer_expiry, 0); /* initialized by sock_init_data */ sk->sk_timer.function = x25_heartbeat_expiry; } void x25_start_heartbeat(struct sock *sk) { mod_timer(&sk->sk_timer, jiffies + 5 * HZ); } void x25_stop_heartbeat(struct sock *sk) { timer_delete(&sk->sk_timer); } void x25_start_t2timer(struct sock *sk) { struct x25_sock *x25 = x25_sk(sk); mod_timer(&x25->timer, jiffies + x25->t2); } void x25_start_t21timer(struct sock *sk) { struct x25_sock *x25 = x25_sk(sk); mod_timer(&x25->timer, jiffies + x25->t21); } void x25_start_t22timer(struct sock *sk) { struct x25_sock *x25 = x25_sk(sk); mod_timer(&x25->timer, jiffies + x25->t22); } void x25_start_t23timer(struct sock *sk) { struct x25_sock *x25 = x25_sk(sk); mod_timer(&x25->timer, jiffies + x25->t23); } void x25_stop_timer(struct sock *sk) { timer_delete(&x25_sk(sk)->timer); } unsigned long x25_display_timer(struct sock *sk) { struct x25_sock *x25 = x25_sk(sk); if (!timer_pending(&x25->timer)) return 0; return x25->timer.expires - jiffies; } static void x25_heartbeat_expiry(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); bh_lock_sock(sk); if (sock_owned_by_user(sk)) /* can currently only occur in state 3 */ goto restart_heartbeat; switch (x25_sk(sk)->state) { case X25_STATE_0: /* * Magic here: If we listen() and a new link dies * before it is accepted() it isn't 'dead' so doesn't * get removed. */ if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { bh_unlock_sock(sk); x25_destroy_socket_from_timer(sk); return; } break; case X25_STATE_3: /* * Check for the state of the receive buffer. */ x25_check_rbuf(sk); break; } restart_heartbeat: x25_start_heartbeat(sk); bh_unlock_sock(sk); } /* * Timer has expired, it may have been T2, T21, T22, or T23. We can tell * by the state machine state. */ static inline void x25_do_timer_expiry(struct sock * sk) { struct x25_sock *x25 = x25_sk(sk); switch (x25->state) { case X25_STATE_3: /* T2 */ if (x25->condition & X25_COND_ACK_PENDING) { x25->condition &= ~X25_COND_ACK_PENDING; x25_enquiry_response(sk); } break; case X25_STATE_1: /* T21 */ case X25_STATE_4: /* T22 */ x25_write_internal(sk, X25_CLEAR_REQUEST); x25->state = X25_STATE_2; x25_start_t23timer(sk); break; case X25_STATE_2: /* T23 */ x25_disconnect(sk, ETIMEDOUT, 0, 0); break; } } static void x25_timer_expiry(struct timer_list *t) { struct x25_sock *x25 = from_timer(x25, t, timer); struct sock *sk = &x25->sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* can currently only occur in state 3 */ if (x25_sk(sk)->state == X25_STATE_3) x25_start_t2timer(sk); } else x25_do_timer_expiry(sk); bh_unlock_sock(sk); }
30 15 14 2 19 2 1 2 2 2 21 1 1 4 4 8 1 5 5 17 3 2 1 1 1 1 1 3 17 1 1 9 7 8 7 3 10 7 1 6 1 1 2 25 1 17 7 17 6 3 19 2 4 4 4 2 2 3 2 2 2 2 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 // SPDX-License-Identifier: GPL-2.0-or-later /* * NetLabel Management Support * * This file defines the management functions for the NetLabel system. The * NetLabel system manages static and dynamic label mappings for network * protocols such as CIPSO and RIPSO. * * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 */ #include <linux/types.h> #include <linux/socket.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <net/calipso.h> #include <linux/atomic.h> #include "netlabel_calipso.h" #include "netlabel_domainhash.h" #include "netlabel_user.h" #include "netlabel_mgmt.h" /* NetLabel configured protocol counter */ atomic_t netlabel_mgmt_protocount = ATOMIC_INIT(0); /* Argument struct for netlbl_domhsh_walk() */ struct netlbl_domhsh_walk_arg { struct netlink_callback *nl_cb; struct sk_buff *skb; u32 seq; }; /* NetLabel Generic NETLINK CIPSOv4 family */ static struct genl_family netlbl_mgmt_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { [NLBL_MGMT_A_DOMAIN] = { .type = NLA_NUL_STRING }, [NLBL_MGMT_A_PROTOCOL] = { .type = NLA_U32 }, [NLBL_MGMT_A_VERSION] = { .type = NLA_U32 }, [NLBL_MGMT_A_CV4DOI] = { .type = NLA_U32 }, [NLBL_MGMT_A_FAMILY] = { .type = NLA_U16 }, [NLBL_MGMT_A_CLPDOI] = { .type = NLA_U32 }, }; /* * Helper Functions */ /** * netlbl_mgmt_add_common - Handle an ADD message * @info: the Generic NETLINK info block * @audit_info: NetLabel audit information * * Description: * Helper function for the ADD and ADDDEF messages to add the domain mappings * from the message to the hash table. See netlabel.h for a description of the * message format. Returns zero on success, negative values on failure. * */ static int netlbl_mgmt_add_common(struct genl_info *info, struct netlbl_audit *audit_info) { void *pmap = NULL; int ret_val = -EINVAL; struct netlbl_domaddr_map *addrmap = NULL; struct cipso_v4_doi *cipsov4 = NULL; #if IS_ENABLED(CONFIG_IPV6) struct calipso_doi *calipso = NULL; #endif u32 tmp_val; struct netlbl_dom_map *entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; entry->def.type = nla_get_u32(info->attrs[NLBL_MGMT_A_PROTOCOL]); if (info->attrs[NLBL_MGMT_A_DOMAIN]) { size_t tmp_size = nla_len(info->attrs[NLBL_MGMT_A_DOMAIN]); entry->domain = kmalloc(tmp_size, GFP_KERNEL); if (entry->domain == NULL) { ret_val = -ENOMEM; goto add_free_entry; } nla_strscpy(entry->domain, info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size); } /* NOTE: internally we allow/use a entry->def.type value of * NETLBL_NLTYPE_ADDRSELECT but we don't currently allow users * to pass that as a protocol value because we need to know the * "real" protocol */ switch (entry->def.type) { case NETLBL_NLTYPE_UNLABELED: entry->family = nla_get_u16_default(info->attrs[NLBL_MGMT_A_FAMILY], AF_UNSPEC); break; case NETLBL_NLTYPE_CIPSOV4: if (!info->attrs[NLBL_MGMT_A_CV4DOI]) goto add_free_domain; tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CV4DOI]); cipsov4 = cipso_v4_doi_getdef(tmp_val); if (cipsov4 == NULL) goto add_free_domain; entry->family = AF_INET; entry->def.cipso = cipsov4; break; #if IS_ENABLED(CONFIG_IPV6) case NETLBL_NLTYPE_CALIPSO: if (!info->attrs[NLBL_MGMT_A_CLPDOI]) goto add_free_domain; tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CLPDOI]); calipso = calipso_doi_getdef(tmp_val); if (calipso == NULL) goto add_free_domain; entry->family = AF_INET6; entry->def.calipso = calipso; break; #endif /* IPv6 */ default: goto add_free_domain; } if ((entry->family == AF_INET && info->attrs[NLBL_MGMT_A_IPV6ADDR]) || (entry->family == AF_INET6 && info->attrs[NLBL_MGMT_A_IPV4ADDR])) goto add_doi_put_def; if (info->attrs[NLBL_MGMT_A_IPV4ADDR]) { struct in_addr *addr; struct in_addr *mask; struct netlbl_domaddr4_map *map; addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL); if (addrmap == NULL) { ret_val = -ENOMEM; goto add_doi_put_def; } INIT_LIST_HEAD(&addrmap->list4); INIT_LIST_HEAD(&addrmap->list6); if (nla_len(info->attrs[NLBL_MGMT_A_IPV4ADDR]) != sizeof(struct in_addr)) { ret_val = -EINVAL; goto add_free_addrmap; } if (nla_len(info->attrs[NLBL_MGMT_A_IPV4MASK]) != sizeof(struct in_addr)) { ret_val = -EINVAL; goto add_free_addrmap; } addr = nla_data(info->attrs[NLBL_MGMT_A_IPV4ADDR]); mask = nla_data(info->attrs[NLBL_MGMT_A_IPV4MASK]); map = kzalloc(sizeof(*map), GFP_KERNEL); if (map == NULL) { ret_val = -ENOMEM; goto add_free_addrmap; } pmap = map; map->list.addr = addr->s_addr & mask->s_addr; map->list.mask = mask->s_addr; map->list.valid = 1; map->def.type = entry->def.type; if (cipsov4) map->def.cipso = cipsov4; ret_val = netlbl_af4list_add(&map->list, &addrmap->list4); if (ret_val != 0) goto add_free_map; entry->family = AF_INET; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; entry->def.addrsel = addrmap; #if IS_ENABLED(CONFIG_IPV6) } else if (info->attrs[NLBL_MGMT_A_IPV6ADDR]) { struct in6_addr *addr; struct in6_addr *mask; struct netlbl_domaddr6_map *map; addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL); if (addrmap == NULL) { ret_val = -ENOMEM; goto add_doi_put_def; } INIT_LIST_HEAD(&addrmap->list4); INIT_LIST_HEAD(&addrmap->list6); if (nla_len(info->attrs[NLBL_MGMT_A_IPV6ADDR]) != sizeof(struct in6_addr)) { ret_val = -EINVAL; goto add_free_addrmap; } if (nla_len(info->attrs[NLBL_MGMT_A_IPV6MASK]) != sizeof(struct in6_addr)) { ret_val = -EINVAL; goto add_free_addrmap; } addr = nla_data(info->attrs[NLBL_MGMT_A_IPV6ADDR]); mask = nla_data(info->attrs[NLBL_MGMT_A_IPV6MASK]); map = kzalloc(sizeof(*map), GFP_KERNEL); if (map == NULL) { ret_val = -ENOMEM; goto add_free_addrmap; } pmap = map; map->list.addr = *addr; map->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; map->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; map->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; map->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; map->list.mask = *mask; map->list.valid = 1; map->def.type = entry->def.type; if (calipso) map->def.calipso = calipso; ret_val = netlbl_af6list_add(&map->list, &addrmap->list6); if (ret_val != 0) goto add_free_map; entry->family = AF_INET6; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; entry->def.addrsel = addrmap; #endif /* IPv6 */ } ret_val = netlbl_domhsh_add(entry, audit_info); if (ret_val != 0) goto add_free_map; return 0; add_free_map: kfree(pmap); add_free_addrmap: kfree(addrmap); add_doi_put_def: cipso_v4_doi_putdef(cipsov4); #if IS_ENABLED(CONFIG_IPV6) calipso_doi_putdef(calipso); #endif add_free_domain: kfree(entry->domain); add_free_entry: kfree(entry); return ret_val; } /** * netlbl_mgmt_listentry - List a NetLabel/LSM domain map entry * @skb: the NETLINK buffer * @entry: the map entry * * Description: * This function is a helper function used by the LISTALL and LISTDEF command * handlers. The caller is responsible for ensuring that the RCU read lock * is held. Returns zero on success, negative values on failure. * */ static int netlbl_mgmt_listentry(struct sk_buff *skb, struct netlbl_dom_map *entry) { int ret_val = 0; struct nlattr *nla_a; struct nlattr *nla_b; struct netlbl_af4list *iter4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; #endif if (entry->domain != NULL) { ret_val = nla_put_string(skb, NLBL_MGMT_A_DOMAIN, entry->domain); if (ret_val != 0) return ret_val; } ret_val = nla_put_u16(skb, NLBL_MGMT_A_FAMILY, entry->family); if (ret_val != 0) return ret_val; switch (entry->def.type) { case NETLBL_NLTYPE_ADDRSELECT: nla_a = nla_nest_start_noflag(skb, NLBL_MGMT_A_SELECTORLIST); if (nla_a == NULL) return -ENOMEM; netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4) { struct netlbl_domaddr4_map *map4; struct in_addr addr_struct; nla_b = nla_nest_start_noflag(skb, NLBL_MGMT_A_ADDRSELECTOR); if (nla_b == NULL) return -ENOMEM; addr_struct.s_addr = iter4->addr; ret_val = nla_put_in_addr(skb, NLBL_MGMT_A_IPV4ADDR, addr_struct.s_addr); if (ret_val != 0) return ret_val; addr_struct.s_addr = iter4->mask; ret_val = nla_put_in_addr(skb, NLBL_MGMT_A_IPV4MASK, addr_struct.s_addr); if (ret_val != 0) return ret_val; map4 = netlbl_domhsh_addr4_entry(iter4); ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, map4->def.type); if (ret_val != 0) return ret_val; switch (map4->def.type) { case NETLBL_NLTYPE_CIPSOV4: ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI, map4->def.cipso->doi); if (ret_val != 0) return ret_val; break; } nla_nest_end(skb, nla_b); } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6) { struct netlbl_domaddr6_map *map6; nla_b = nla_nest_start_noflag(skb, NLBL_MGMT_A_ADDRSELECTOR); if (nla_b == NULL) return -ENOMEM; ret_val = nla_put_in6_addr(skb, NLBL_MGMT_A_IPV6ADDR, &iter6->addr); if (ret_val != 0) return ret_val; ret_val = nla_put_in6_addr(skb, NLBL_MGMT_A_IPV6MASK, &iter6->mask); if (ret_val != 0) return ret_val; map6 = netlbl_domhsh_addr6_entry(iter6); ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, map6->def.type); if (ret_val != 0) return ret_val; switch (map6->def.type) { case NETLBL_NLTYPE_CALIPSO: ret_val = nla_put_u32(skb, NLBL_MGMT_A_CLPDOI, map6->def.calipso->doi); if (ret_val != 0) return ret_val; break; } nla_nest_end(skb, nla_b); } #endif /* IPv6 */ nla_nest_end(skb, nla_a); break; case NETLBL_NLTYPE_UNLABELED: ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, entry->def.type); break; case NETLBL_NLTYPE_CIPSOV4: ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, entry->def.type); if (ret_val != 0) return ret_val; ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI, entry->def.cipso->doi); break; case NETLBL_NLTYPE_CALIPSO: ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, entry->def.type); if (ret_val != 0) return ret_val; ret_val = nla_put_u32(skb, NLBL_MGMT_A_CLPDOI, entry->def.calipso->doi); break; } return ret_val; } /* * NetLabel Command Handlers */ /** * netlbl_mgmt_add - Handle an ADD message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated ADD message and add the domains from the message * to the hash table. See netlabel.h for a description of the message format. * Returns zero on success, negative values on failure. * */ static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info) { struct netlbl_audit audit_info; if ((!info->attrs[NLBL_MGMT_A_DOMAIN]) || (!info->attrs[NLBL_MGMT_A_PROTOCOL]) || (info->attrs[NLBL_MGMT_A_IPV4ADDR] && info->attrs[NLBL_MGMT_A_IPV6ADDR]) || (info->attrs[NLBL_MGMT_A_IPV4MASK] && info->attrs[NLBL_MGMT_A_IPV6MASK]) || ((info->attrs[NLBL_MGMT_A_IPV4ADDR] != NULL) ^ (info->attrs[NLBL_MGMT_A_IPV4MASK] != NULL)) || ((info->attrs[NLBL_MGMT_A_IPV6ADDR] != NULL) ^ (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); return netlbl_mgmt_add_common(info, &audit_info); } /** * netlbl_mgmt_remove - Handle a REMOVE message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated REMOVE message and remove the specified domain * mappings. Returns zero on success, negative values on failure. * */ static int netlbl_mgmt_remove(struct sk_buff *skb, struct genl_info *info) { char *domain; struct netlbl_audit audit_info; if (!info->attrs[NLBL_MGMT_A_DOMAIN]) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); domain = nla_data(info->attrs[NLBL_MGMT_A_DOMAIN]); return netlbl_domhsh_remove(domain, AF_UNSPEC, &audit_info); } /** * netlbl_mgmt_listall_cb - netlbl_domhsh_walk() callback for LISTALL * @entry: the domain mapping hash table entry * @arg: the netlbl_domhsh_walk_arg structure * * Description: * This function is designed to be used as a callback to the * netlbl_domhsh_walk() function for use in generating a response for a LISTALL * message. Returns the size of the message on success, negative values on * failure. * */ static int netlbl_mgmt_listall_cb(struct netlbl_dom_map *entry, void *arg) { int ret_val = -ENOMEM; struct netlbl_domhsh_walk_arg *cb_arg = arg; void *data; data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, cb_arg->seq, &netlbl_mgmt_gnl_family, NLM_F_MULTI, NLBL_MGMT_C_LISTALL); if (data == NULL) goto listall_cb_failure; ret_val = netlbl_mgmt_listentry(cb_arg->skb, entry); if (ret_val != 0) goto listall_cb_failure; cb_arg->seq++; genlmsg_end(cb_arg->skb, data); return 0; listall_cb_failure: genlmsg_cancel(cb_arg->skb, data); return ret_val; } /** * netlbl_mgmt_listall - Handle a LISTALL message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated LISTALL message and dumps the domain hash table in * a form suitable for use in a kernel generated LISTALL message. Returns zero * on success, negative values on failure. * */ static int netlbl_mgmt_listall(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_domhsh_walk_arg cb_arg; u32 skip_bkt = cb->args[0]; u32 skip_chain = cb->args[1]; cb_arg.nl_cb = cb; cb_arg.skb = skb; cb_arg.seq = cb->nlh->nlmsg_seq; netlbl_domhsh_walk(&skip_bkt, &skip_chain, netlbl_mgmt_listall_cb, &cb_arg); cb->args[0] = skip_bkt; cb->args[1] = skip_chain; return skb->len; } /** * netlbl_mgmt_adddef - Handle an ADDDEF message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated ADDDEF message and respond accordingly. Returns * zero on success, negative values on failure. * */ static int netlbl_mgmt_adddef(struct sk_buff *skb, struct genl_info *info) { struct netlbl_audit audit_info; if ((!info->attrs[NLBL_MGMT_A_PROTOCOL]) || (info->attrs[NLBL_MGMT_A_IPV4ADDR] && info->attrs[NLBL_MGMT_A_IPV6ADDR]) || (info->attrs[NLBL_MGMT_A_IPV4MASK] && info->attrs[NLBL_MGMT_A_IPV6MASK]) || ((info->attrs[NLBL_MGMT_A_IPV4ADDR] != NULL) ^ (info->attrs[NLBL_MGMT_A_IPV4MASK] != NULL)) || ((info->attrs[NLBL_MGMT_A_IPV6ADDR] != NULL) ^ (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); return netlbl_mgmt_add_common(info, &audit_info); } /** * netlbl_mgmt_removedef - Handle a REMOVEDEF message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated REMOVEDEF message and remove the default domain * mapping. Returns zero on success, negative values on failure. * */ static int netlbl_mgmt_removedef(struct sk_buff *skb, struct genl_info *info) { struct netlbl_audit audit_info; netlbl_netlink_auditinfo(&audit_info); return netlbl_domhsh_remove_default(AF_UNSPEC, &audit_info); } /** * netlbl_mgmt_listdef - Handle a LISTDEF message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated LISTDEF message and dumps the default domain * mapping in a form suitable for use in a kernel generated LISTDEF message. * Returns zero on success, negative values on failure. * */ static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info) { int ret_val = -ENOMEM; struct sk_buff *ans_skb = NULL; void *data; struct netlbl_dom_map *entry; u16 family; family = nla_get_u16_default(info->attrs[NLBL_MGMT_A_FAMILY], AF_INET); ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (ans_skb == NULL) return -ENOMEM; data = genlmsg_put_reply(ans_skb, info, &netlbl_mgmt_gnl_family, 0, NLBL_MGMT_C_LISTDEF); if (data == NULL) goto listdef_failure; rcu_read_lock(); entry = netlbl_domhsh_getentry(NULL, family); if (entry == NULL) { ret_val = -ENOENT; goto listdef_failure_lock; } ret_val = netlbl_mgmt_listentry(ans_skb, entry); rcu_read_unlock(); if (ret_val != 0) goto listdef_failure; genlmsg_end(ans_skb, data); return genlmsg_reply(ans_skb, info); listdef_failure_lock: rcu_read_unlock(); listdef_failure: kfree_skb(ans_skb); return ret_val; } /** * netlbl_mgmt_protocols_cb - Write an individual PROTOCOL message response * @skb: the skb to write to * @cb: the NETLINK callback * @protocol: the NetLabel protocol to use in the message * * Description: * This function is to be used in conjunction with netlbl_mgmt_protocols() to * answer a application's PROTOCOLS message. Returns the size of the message * on success, negative values on failure. * */ static int netlbl_mgmt_protocols_cb(struct sk_buff *skb, struct netlink_callback *cb, u32 protocol) { int ret_val = -ENOMEM; void *data; data = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &netlbl_mgmt_gnl_family, NLM_F_MULTI, NLBL_MGMT_C_PROTOCOLS); if (data == NULL) goto protocols_cb_failure; ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, protocol); if (ret_val != 0) goto protocols_cb_failure; genlmsg_end(skb, data); return 0; protocols_cb_failure: genlmsg_cancel(skb, data); return ret_val; } /** * netlbl_mgmt_protocols - Handle a PROTOCOLS message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated PROTOCOLS message and respond accordingly. * */ static int netlbl_mgmt_protocols(struct sk_buff *skb, struct netlink_callback *cb) { u32 protos_sent = cb->args[0]; if (protos_sent == 0) { if (netlbl_mgmt_protocols_cb(skb, cb, NETLBL_NLTYPE_UNLABELED) < 0) goto protocols_return; protos_sent++; } if (protos_sent == 1) { if (netlbl_mgmt_protocols_cb(skb, cb, NETLBL_NLTYPE_CIPSOV4) < 0) goto protocols_return; protos_sent++; } #if IS_ENABLED(CONFIG_IPV6) if (protos_sent == 2) { if (netlbl_mgmt_protocols_cb(skb, cb, NETLBL_NLTYPE_CALIPSO) < 0) goto protocols_return; protos_sent++; } #endif protocols_return: cb->args[0] = protos_sent; return skb->len; } /** * netlbl_mgmt_version - Handle a VERSION message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated VERSION message and respond accordingly. Returns * zero on success, negative values on failure. * */ static int netlbl_mgmt_version(struct sk_buff *skb, struct genl_info *info) { int ret_val = -ENOMEM; struct sk_buff *ans_skb = NULL; void *data; ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (ans_skb == NULL) return -ENOMEM; data = genlmsg_put_reply(ans_skb, info, &netlbl_mgmt_gnl_family, 0, NLBL_MGMT_C_VERSION); if (data == NULL) goto version_failure; ret_val = nla_put_u32(ans_skb, NLBL_MGMT_A_VERSION, NETLBL_PROTO_VERSION); if (ret_val != 0) goto version_failure; genlmsg_end(ans_skb, data); return genlmsg_reply(ans_skb, info); version_failure: kfree_skb(ans_skb); return ret_val; } /* * NetLabel Generic NETLINK Command Definitions */ static const struct genl_small_ops netlbl_mgmt_genl_ops[] = { { .cmd = NLBL_MGMT_C_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_mgmt_add, .dumpit = NULL, }, { .cmd = NLBL_MGMT_C_REMOVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_mgmt_remove, .dumpit = NULL, }, { .cmd = NLBL_MGMT_C_LISTALL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_mgmt_listall, }, { .cmd = NLBL_MGMT_C_ADDDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_mgmt_adddef, .dumpit = NULL, }, { .cmd = NLBL_MGMT_C_REMOVEDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_mgmt_removedef, .dumpit = NULL, }, { .cmd = NLBL_MGMT_C_LISTDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = netlbl_mgmt_listdef, .dumpit = NULL, }, { .cmd = NLBL_MGMT_C_PROTOCOLS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_mgmt_protocols, }, { .cmd = NLBL_MGMT_C_VERSION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = netlbl_mgmt_version, .dumpit = NULL, }, }; static struct genl_family netlbl_mgmt_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_MGMT_NAME, .version = NETLBL_PROTO_VERSION, .maxattr = NLBL_MGMT_A_MAX, .policy = netlbl_mgmt_genl_policy, .module = THIS_MODULE, .small_ops = netlbl_mgmt_genl_ops, .n_small_ops = ARRAY_SIZE(netlbl_mgmt_genl_ops), .resv_start_op = NLBL_MGMT_C_VERSION + 1, }; /* * NetLabel Generic NETLINK Protocol Functions */ /** * netlbl_mgmt_genl_init - Register the NetLabel management component * * Description: * Register the NetLabel management component with the Generic NETLINK * mechanism. Returns zero on success, negative values on failure. * */ int __init netlbl_mgmt_genl_init(void) { return genl_register_family(&netlbl_mgmt_gnl_family); }
1763 1766 1414 1567 214 15 136 302 22 718 721 40 610 573 266 485 4 5 680 1017 1015 1016 799 1017 4 20 3 23 3 9 10 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NEIGHBOUR_H #define _NET_NEIGHBOUR_H #include <linux/neighbour.h> /* * Generic neighbour manipulation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Changes: * * Harald Welte: <laforge@gnumonks.org> * - Add neighbour cache statistics like rtstat */ #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> #include <linux/bitmap.h> #include <linux/err.h> #include <linux/sysctl.h> #include <linux/workqueue.h> #include <net/rtnetlink.h> #include <net/neighbour_tables.h> /* * NUD stands for "neighbor unreachability detection" */ #define NUD_IN_TIMER (NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE) #define NUD_VALID (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY) #define NUD_CONNECTED (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE) struct neighbour; enum { NEIGH_VAR_MCAST_PROBES, NEIGH_VAR_UCAST_PROBES, NEIGH_VAR_APP_PROBES, NEIGH_VAR_MCAST_REPROBES, NEIGH_VAR_RETRANS_TIME, NEIGH_VAR_BASE_REACHABLE_TIME, NEIGH_VAR_DELAY_PROBE_TIME, NEIGH_VAR_INTERVAL_PROBE_TIME_MS, NEIGH_VAR_GC_STALETIME, NEIGH_VAR_QUEUE_LEN_BYTES, NEIGH_VAR_PROXY_QLEN, NEIGH_VAR_ANYCAST_DELAY, NEIGH_VAR_PROXY_DELAY, NEIGH_VAR_LOCKTIME, #define NEIGH_VAR_DATA_MAX (NEIGH_VAR_LOCKTIME + 1) /* Following are used as a second way to access one of the above */ NEIGH_VAR_QUEUE_LEN, /* same data as NEIGH_VAR_QUEUE_LEN_BYTES */ NEIGH_VAR_RETRANS_TIME_MS, /* same data as NEIGH_VAR_RETRANS_TIME */ NEIGH_VAR_BASE_REACHABLE_TIME_MS, /* same data as NEIGH_VAR_BASE_REACHABLE_TIME */ /* Following are used by "default" only */ NEIGH_VAR_GC_INTERVAL, NEIGH_VAR_GC_THRESH1, NEIGH_VAR_GC_THRESH2, NEIGH_VAR_GC_THRESH3, NEIGH_VAR_MAX }; struct neigh_parms { possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head list; int (*neigh_setup)(struct neighbour *); struct neigh_table *tbl; void *sysctl_table; int dead; refcount_t refcnt; struct rcu_head rcu_head; int reachable_time; u32 qlen; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX); }; static inline void neigh_var_set(struct neigh_parms *p, int index, int val) { set_bit(index, p->data_state); p->data[index] = val; } #define NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) /* In ndo_neigh_setup, NEIGH_VAR_INIT should be used. * In other cases, NEIGH_VAR_SET should be used. */ #define NEIGH_VAR_INIT(p, attr, val) (NEIGH_VAR(p, attr) = val) #define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val) static inline void neigh_parms_data_state_setall(struct neigh_parms *p) { bitmap_fill(p->data_state, NEIGH_VAR_DATA_MAX); } static inline void neigh_parms_data_state_cleanall(struct neigh_parms *p) { bitmap_zero(p->data_state, NEIGH_VAR_DATA_MAX); } struct neigh_statistics { unsigned long allocs; /* number of allocated neighs */ unsigned long destroys; /* number of destroyed neighs */ unsigned long hash_grows; /* number of hash resizes */ unsigned long res_failed; /* number of failed resolutions */ unsigned long lookups; /* number of lookups */ unsigned long hits; /* number of hits (among lookups) */ unsigned long rcv_probes_mcast; /* number of received mcast ipv6 */ unsigned long rcv_probes_ucast; /* number of received ucast ipv6 */ unsigned long periodic_gc_runs; /* number of periodic GC runs */ unsigned long forced_gc_runs; /* number of forced GC runs */ unsigned long unres_discards; /* number of unresolved drops */ unsigned long table_fulls; /* times even gc couldn't help */ }; #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { struct hlist_node hash; struct hlist_node dev_list; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; unsigned long updated; rwlock_t lock; refcount_t refcnt; unsigned int arp_queue_len_bytes; struct sk_buff_head arp_queue; struct timer_list timer; unsigned long used; atomic_t probes; u8 nud_state; u8 type; u8 dead; u8 protocol; u32 flags; seqlock_t ha_lock; unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8); struct hh_cache hh; int (*output)(struct neighbour *, struct sk_buff *); const struct neigh_ops *ops; struct list_head gc_list; struct list_head managed_list; struct rcu_head rcu; struct net_device *dev; netdevice_tracker dev_tracker; u8 primary_key[]; } __randomize_layout; struct neigh_ops { int family; void (*solicit)(struct neighbour *, struct sk_buff *); void (*error_report)(struct neighbour *, struct sk_buff *); int (*output)(struct neighbour *, struct sk_buff *); int (*connected_output)(struct neighbour *, struct sk_buff *); }; struct pneigh_entry { struct pneigh_entry *next; possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; u32 flags; u8 protocol; u32 key[]; }; /* * neighbour table manipulation */ #define NEIGH_NUM_HASH_RND 4 struct neigh_hash_table { struct hlist_head *hash_heads; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; struct rcu_head rcu; }; struct neigh_table { int family; unsigned int entry_size; unsigned int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); bool (*key_eq)(const struct neighbour *, const void *pkey); int (*constructor)(struct neighbour *); int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); void (*proxy_redo)(struct sk_buff *skb); int (*is_multicast)(const void *pkey); bool (*allow_add)(const struct net_device *dev, struct netlink_ext_ack *extack); char *id; struct neigh_parms parms; struct list_head parms_list; int gc_interval; int gc_thresh1; int gc_thresh2; int gc_thresh3; unsigned long last_flush; struct delayed_work gc_work; struct delayed_work managed_work; struct timer_list proxy_timer; struct sk_buff_head proxy_queue; atomic_t entries; atomic_t gc_entries; struct list_head gc_list; struct list_head managed_list; rwlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; struct pneigh_entry **phash_buckets; }; static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; } #define NEIGH_PRIV_ALIGN sizeof(long long) #define NEIGH_ENTRY_SIZE(size) ALIGN((size), NEIGH_PRIV_ALIGN) static inline void *neighbour_priv(const struct neighbour *n) { return (char *)n + n->tbl->entry_size; } /* flags for neigh_update() */ #define NEIGH_UPDATE_F_OVERRIDE BIT(0) #define NEIGH_UPDATE_F_WEAK_OVERRIDE BIT(1) #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER BIT(2) #define NEIGH_UPDATE_F_USE BIT(3) #define NEIGH_UPDATE_F_MANAGED BIT(4) #define NEIGH_UPDATE_F_EXT_LEARNED BIT(5) #define NEIGH_UPDATE_F_ISROUTER BIT(6) #define NEIGH_UPDATE_F_ADMIN BIT(7) /* In-kernel representation for NDA_FLAGS_EXT flags: */ #define NTF_OLD_MASK 0xff #define NTF_EXT_SHIFT 8 #define NTF_EXT_MASK (NTF_EXT_MANAGED) #define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT) extern const struct nla_policy nda_policy[]; #define neigh_for_each_in_bucket(pos, head) hlist_for_each_entry(pos, head, hash) #define neigh_for_each_in_bucket_rcu(pos, head) \ hlist_for_each_entry_rcu(pos, head, hash) #define neigh_for_each_in_bucket_safe(pos, tmp, head) \ hlist_for_each_entry_safe(pos, tmp, head, hash) static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) { return *(const u32 *)n->primary_key == *(const u32 *)pkey; } static inline bool neigh_key_eq128(const struct neighbour *n, const void *pkey) { const u32 *n32 = (const u32 *)n->primary_key; const u32 *p32 = pkey; return ((n32[0] ^ p32[0]) | (n32[1] ^ p32[1]) | (n32[2] ^ p32[2]) | (n32[3] ^ p32[3])) == 0; } static inline struct neighbour *___neigh_lookup_noref( struct neigh_table *tbl, bool (*key_eq)(const struct neighbour *n, const void *pkey), __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd), const void *pkey, struct net_device *dev) { struct neigh_hash_table *nht = rcu_dereference(tbl->nht); struct neighbour *n; u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[hash_val]) if (n->dev == dev && key_eq(n, pkey)) return n; return NULL; } static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev); } static inline void neigh_confirm(struct neighbour *n) { if (n) { unsigned long now = jiffies; /* avoid dirtying neighbour */ if (READ_ONCE(n->confirmed) != now) WRITE_ONCE(n->confirmed, now); } } void neigh_table_init(int index, struct neigh_table *tbl); int neigh_table_clear(int index, struct neigh_table *tbl); struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev); struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref); static inline struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); bool neigh_remove_one(struct neighbour *ndel); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev); int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev); struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl); void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms); static inline struct net *neigh_parms_net(const struct neigh_parms *parms) { return read_pnet(&parms->net); } unsigned long neigh_rand_reach_time(unsigned long base); void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev, int creat); struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); static inline struct net *pneigh_net(const struct pneigh_entry *pneigh) { return read_pnet(&pneigh->net); } void neigh_app_ns(struct neighbour *n); void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)); int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *); struct neigh_seq_state { struct seq_net_private p; struct neigh_table *tbl; struct neigh_hash_table *nht; void *(*neigh_sub_iter)(struct neigh_seq_state *state, struct neighbour *n, loff_t *pos); unsigned int bucket; unsigned int flags; #define NEIGH_SEQ_NEIGH_ONLY 0x00000001 #define NEIGH_SEQ_IS_PNEIGH 0x00000002 #define NEIGH_SEQ_SKIP_NOARP 0x00000004 }; void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *, unsigned int); void *neigh_seq_next(struct seq_file *, void *, loff_t *); void neigh_seq_stop(struct seq_file *, void *); int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *proc_handler); void neigh_sysctl_unregister(struct neigh_parms *p); static inline void __neigh_parms_put(struct neigh_parms *parms) { refcount_dec(&parms->refcnt); } static inline struct neigh_parms *neigh_parms_clone(struct neigh_parms *parms) { refcount_inc(&parms->refcnt); return parms; } /* * Neighbour references */ static inline void neigh_release(struct neighbour *neigh) { if (refcount_dec_and_test(&neigh->refcnt)) neigh_destroy(neigh); } static inline struct neighbour * neigh_clone(struct neighbour *neigh) { if (neigh) refcount_inc(&neigh->refcnt); return neigh; } #define neigh_hold(n) refcount_inc(&(n)->refcnt) static __always_inline int neigh_event_send_probe(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok) { unsigned long now = jiffies; if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); if (!(READ_ONCE(neigh->nud_state) & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))) return __neigh_event_send(neigh, skb, immediate_ok); return 0; } static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { return neigh_event_send_probe(neigh, skb, true); } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq, hh_alen; do { seq = read_seqbegin(&hh->hh_lock); hh_alen = HH_DATA_ALIGN(ETH_HLEN); memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN); } while (read_seqretry(&hh->hh_lock, seq)); return 0; } #endif static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) { unsigned int hh_alen = 0; unsigned int seq; unsigned int hh_len; do { seq = read_seqbegin(&hh->hh_lock); hh_len = READ_ONCE(hh->hh_len); if (likely(hh_len <= HH_DATA_MOD)) { hh_alen = HH_DATA_MOD; /* skb_push() would proceed silently if we have room for * the unaligned size but not for the aligned size: * check headroom explicitly. */ if (likely(skb_headroom(skb) >= HH_DATA_MOD)) { /* this is inlined by gcc */ memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD); } } else { hh_alen = HH_DATA_ALIGN(hh_len); if (likely(skb_headroom(skb) >= hh_alen)) { memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); } } } while (read_seqretry(&hh->hh_lock, seq)); if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) { kfree_skb(skb); return NET_XMIT_DROP; } __skb_push(skb, hh_len); return dev_queue_xmit(skb); } static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, bool skip_cache) { const struct hh_cache *hh = &n->hh; /* n->nud_state and hh->hh_len could be changed under us. * neigh_hh_output() is taking care of the race later. */ if (!skip_cache && (READ_ONCE(n->nud_state) & NUD_CONNECTED) && READ_ONCE(hh->hh_len)) return neigh_hh_output(hh, skb); return READ_ONCE(n->output)(n, skb); } static inline struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n || !creat) return n; n = neigh_create(tbl, pkey, dev); return IS_ERR(n) ? NULL : n; } static inline struct neighbour * __neigh_lookup_errno(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n) return n; return neigh_create(tbl, pkey, dev); } struct neighbour_cb { unsigned long sched_next; unsigned int flags; }; #define LOCALLY_ENQUEUED 0x1 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, const struct net_device *dev) { unsigned int seq; do { seq = read_seqbegin(&n->ha_lock); memcpy(dst, n->ha, dev->addr_len); } while (read_seqretry(&n->ha_lock, seq)); } static inline void neigh_update_is_router(struct neighbour *neigh, u32 flags, int *notify) { u8 ndm_flags = 0; ndm_flags |= (flags & NEIGH_UPDATE_F_ISROUTER) ? NTF_ROUTER : 0; if ((neigh->flags ^ ndm_flags) & NTF_ROUTER) { if (ndm_flags & NTF_ROUTER) neigh->flags |= NTF_ROUTER; else neigh->flags &= ~NTF_ROUTER; *notify = 1; } } #endif
68 68 68 68 68 68 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 // SPDX-License-Identifier: GPL-2.0-only /* * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong * because MTRRs can span up to 40 bits (36bits on most modern x86) */ #include <linux/export.h> #include <linux/init.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/cc_platform.h> #include <linux/string_choices.h> #include <asm/processor-flags.h> #include <asm/cacheinfo.h> #include <asm/cpufeature.h> #include <asm/cpu_device_id.h> #include <asm/hypervisor.h> #include <asm/mshyperv.h> #include <asm/tlbflush.h> #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/memtype.h> #include "mtrr.h" struct fixed_range_block { int base_msr; /* start address of an MTRR block */ int ranges; /* number of MTRRs in this block */ }; static struct fixed_range_block fixed_range_blocks[] = { { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; struct cache_map { u64 start; u64 end; u64 flags; u64 type:8; u64 fixed:1; }; bool mtrr_debug; static int __init mtrr_param_setup(char *str) { int rc = 0; if (!str) return -EINVAL; if (!strcmp(str, "debug")) mtrr_debug = true; else rc = -EINVAL; return rc; } early_param("mtrr", mtrr_param_setup); /* * CACHE_MAP_MAX is the maximum number of memory ranges in cache_map, where * no 2 adjacent ranges have the same cache mode (those would be merged). * The number is based on the worst case: * - no two adjacent fixed MTRRs share the same cache mode * - one variable MTRR is spanning a huge area with mode WB * - 255 variable MTRRs with mode UC all overlap with the WB MTRR, creating 2 * additional ranges each (result like "ababababa...aba" with a = WB, b = UC), * accounting for MTRR_MAX_VAR_RANGES * 2 - 1 range entries * - a TOP_MEM2 area (even with overlapping an UC MTRR can't add 2 range entries * to the possible maximum, as it always starts at 4GB, thus it can't be in * the middle of that MTRR, unless that MTRR starts at 0, which would remove * the initial "a" from the "abababa" pattern above) * The map won't contain ranges with no matching MTRR (those fall back to the * default cache mode). */ #define CACHE_MAP_MAX (MTRR_NUM_FIXED_RANGES + MTRR_MAX_VAR_RANGES * 2) static struct cache_map init_cache_map[CACHE_MAP_MAX] __initdata; static struct cache_map *cache_map __refdata = init_cache_map; static unsigned int cache_map_size = CACHE_MAP_MAX; static unsigned int cache_map_n; static unsigned int cache_map_fixed; static unsigned long smp_changes_mask; static int mtrr_state_set; u64 mtrr_tom2; struct mtrr_state_type mtrr_state; EXPORT_SYMBOL_GPL(mtrr_state); /* Reserved bits in the high portion of the MTRRphysBaseN MSR. */ u32 phys_hi_rsvd; /* * BIOS is expected to clear MtrrFixDramModEn bit, see for example * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD * Opteron Processors" (26094 Rev. 3.30 February 2006), section * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set * to 1 during BIOS initialization of the fixed MTRRs, then cleared to * 0 for operation." */ static inline void k8_check_syscfg_dram_mod_en(void) { u32 lo, hi; if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0x0f))) return; if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return; rdmsr(MSR_AMD64_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" " not cleared by BIOS, clearing this bit\n", smp_processor_id()); lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; mtrr_wrmsr(MSR_AMD64_SYSCFG, lo, hi); } } /* Get the size of contiguous MTRR range */ static u64 get_mtrr_size(u64 mask) { u64 size; mask |= (u64)phys_hi_rsvd << 32; size = -mask; return size; } static u8 get_var_mtrr_state(unsigned int reg, u64 *start, u64 *size) { struct mtrr_var_range *mtrr = mtrr_state.var_ranges + reg; if (!(mtrr->mask_lo & MTRR_PHYSMASK_V)) return MTRR_TYPE_INVALID; *start = (((u64)mtrr->base_hi) << 32) + (mtrr->base_lo & PAGE_MASK); *size = get_mtrr_size((((u64)mtrr->mask_hi) << 32) + (mtrr->mask_lo & PAGE_MASK)); return mtrr->base_lo & MTRR_PHYSBASE_TYPE; } static u8 get_effective_type(u8 type1, u8 type2) { if (type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE) return MTRR_TYPE_UNCACHABLE; if ((type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH) || (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK)) return MTRR_TYPE_WRTHROUGH; if (type1 != type2) return MTRR_TYPE_UNCACHABLE; return type1; } static void rm_map_entry_at(int idx) { cache_map_n--; if (cache_map_n > idx) { memmove(cache_map + idx, cache_map + idx + 1, sizeof(*cache_map) * (cache_map_n - idx)); } } /* * Add an entry into cache_map at a specific index. Merges adjacent entries if * appropriate. Return the number of merges for correcting the scan index * (this is needed as merging will reduce the number of entries, which will * result in skipping entries in future iterations if the scan index isn't * corrected). * Note that the corrected index can never go below -1 (resulting in being 0 in * the next scan iteration), as "2" is returned only if the current index is * larger than zero. */ static int add_map_entry_at(u64 start, u64 end, u8 type, int idx) { bool merge_prev = false, merge_next = false; if (start >= end) return 0; if (idx > 0) { struct cache_map *prev = cache_map + idx - 1; if (!prev->fixed && start == prev->end && type == prev->type) merge_prev = true; } if (idx < cache_map_n) { struct cache_map *next = cache_map + idx; if (!next->fixed && end == next->start && type == next->type) merge_next = true; } if (merge_prev && merge_next) { cache_map[idx - 1].end = cache_map[idx].end; rm_map_entry_at(idx); return 2; } if (merge_prev) { cache_map[idx - 1].end = end; return 1; } if (merge_next) { cache_map[idx].start = start; return 1; } /* Sanity check: the array should NEVER be too small! */ if (cache_map_n == cache_map_size) { WARN(1, "MTRR cache mode memory map exhausted!\n"); cache_map_n = cache_map_fixed; return 0; } if (cache_map_n > idx) { memmove(cache_map + idx + 1, cache_map + idx, sizeof(*cache_map) * (cache_map_n - idx)); } cache_map[idx].start = start; cache_map[idx].end = end; cache_map[idx].type = type; cache_map[idx].fixed = 0; cache_map_n++; return 0; } /* Clear a part of an entry. Return 1 if start of entry is still valid. */ static int clr_map_range_at(u64 start, u64 end, int idx) { int ret = start != cache_map[idx].start; u64 tmp; if (start == cache_map[idx].start && end == cache_map[idx].end) { rm_map_entry_at(idx); } else if (start == cache_map[idx].start) { cache_map[idx].start = end; } else if (end == cache_map[idx].end) { cache_map[idx].end = start; } else { tmp = cache_map[idx].end; cache_map[idx].end = start; add_map_entry_at(end, tmp, cache_map[idx].type, idx + 1); } return ret; } /* * Add MTRR to the map. The current map is scanned and each part of the MTRR * either overlapping with an existing entry or with a hole in the map is * handled separately. */ static void add_map_entry(u64 start, u64 end, u8 type) { u8 new_type, old_type; u64 tmp; int i; for (i = 0; i < cache_map_n && start < end; i++) { if (start >= cache_map[i].end) continue; if (start < cache_map[i].start) { /* Region start has no overlap. */ tmp = min(end, cache_map[i].start); i -= add_map_entry_at(start, tmp, type, i); start = tmp; continue; } new_type = get_effective_type(type, cache_map[i].type); old_type = cache_map[i].type; if (cache_map[i].fixed || new_type == old_type) { /* Cut off start of new entry. */ start = cache_map[i].end; continue; } /* Handle only overlapping part of region. */ tmp = min(end, cache_map[i].end); i += clr_map_range_at(start, tmp, i); i -= add_map_entry_at(start, tmp, new_type, i); start = tmp; } /* Add rest of region after last map entry (rest might be empty). */ add_map_entry_at(start, end, type, i); } /* Add variable MTRRs to cache map. */ static void map_add_var(void) { u64 start, size; unsigned int i; u8 type; /* * Add AMD TOP_MEM2 area. Can't be added in mtrr_build_map(), as it * needs to be added again when rebuilding the map due to potentially * having moved as a result of variable MTRRs for memory below 4GB. */ if (mtrr_tom2) { add_map_entry(BIT_ULL(32), mtrr_tom2, MTRR_TYPE_WRBACK); cache_map[cache_map_n - 1].fixed = 1; } for (i = 0; i < num_var_ranges; i++) { type = get_var_mtrr_state(i, &start, &size); if (type != MTRR_TYPE_INVALID) add_map_entry(start, start + size, type); } } /* * Rebuild map by replacing variable entries. Needs to be called when MTRR * registers are being changed after boot, as such changes could include * removals of registers, which are complicated to handle without rebuild of * the map. */ void generic_rebuild_map(void) { if (mtrr_if != &generic_mtrr_ops) return; cache_map_n = cache_map_fixed; map_add_var(); } static unsigned int __init get_cache_map_size(void) { return cache_map_fixed + 2 * num_var_ranges + (mtrr_tom2 != 0); } /* Build the cache_map containing the cache modes per memory range. */ void __init mtrr_build_map(void) { u64 start, end, size; unsigned int i; u8 type; /* Add fixed MTRRs, optimize for adjacent entries with same type. */ if (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED) { /* * Start with 64k size fixed entries, preset 1st one (hence the * loop below is starting with index 1). */ start = 0; end = size = 0x10000; type = mtrr_state.fixed_ranges[0]; for (i = 1; i < MTRR_NUM_FIXED_RANGES; i++) { /* 8 64k entries, then 16 16k ones, rest 4k. */ if (i == 8 || i == 24) size >>= 2; if (mtrr_state.fixed_ranges[i] != type) { add_map_entry(start, end, type); start = end; type = mtrr_state.fixed_ranges[i]; } end += size; } add_map_entry(start, end, type); } /* Mark fixed, they take precedence. */ for (i = 0; i < cache_map_n; i++) cache_map[i].fixed = 1; cache_map_fixed = cache_map_n; map_add_var(); pr_info("MTRR map: %u entries (%u fixed + %u variable; max %u), built from %u variable MTRRs\n", cache_map_n, cache_map_fixed, cache_map_n - cache_map_fixed, get_cache_map_size(), num_var_ranges + (mtrr_tom2 != 0)); if (mtrr_debug) { for (i = 0; i < cache_map_n; i++) { pr_info("%3u: %016llx-%016llx %s\n", i, cache_map[i].start, cache_map[i].end - 1, mtrr_attrib_to_str(cache_map[i].type)); } } } /* Copy the cache_map from __initdata memory to dynamically allocated one. */ void __init mtrr_copy_map(void) { unsigned int new_size = get_cache_map_size(); if (!mtrr_state.enabled || !new_size) { cache_map = NULL; return; } mutex_lock(&mtrr_mutex); cache_map = kcalloc(new_size, sizeof(*cache_map), GFP_KERNEL); if (cache_map) { memmove(cache_map, init_cache_map, cache_map_n * sizeof(*cache_map)); cache_map_size = new_size; } else { mtrr_state.enabled = 0; pr_err("MTRRs disabled due to allocation failure for lookup map.\n"); } mutex_unlock(&mtrr_mutex); } /** * guest_force_mtrr_state - set static MTRR state for a guest * * Used to set MTRR state via different means (e.g. with data obtained from * a hypervisor). * Is allowed only for special cases when running virtualized. Must be called * from the x86_init.hyper.init_platform() hook. It can be called only once. * The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR * is cleared. * * @var: MTRR variable range array to use * @num_var: length of the @var array * @def_type: default caching type */ void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var, mtrr_type def_type) { unsigned int i; /* Only allowed to be called once before mtrr_bp_init(). */ if (WARN_ON_ONCE(mtrr_state_set)) return; /* Only allowed when running virtualized. */ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) return; /* * Only allowed for special virtualization cases: * - when running as Hyper-V, SEV-SNP guest using vTOM * - when running as Xen PV guest * - when running as SEV-SNP or TDX guest to avoid unnecessary * VMM communication/Virtualization exceptions (#VC, #VE) */ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !hv_is_isolation_supported() && !cpu_feature_enabled(X86_FEATURE_XENPV) && !cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) return; /* Disable MTRR in order to disable MTRR modifications. */ setup_clear_cpu_cap(X86_FEATURE_MTRR); if (var) { if (num_var > MTRR_MAX_VAR_RANGES) { pr_warn("Trying to overwrite MTRR state with %u variable entries\n", num_var); num_var = MTRR_MAX_VAR_RANGES; } for (i = 0; i < num_var; i++) mtrr_state.var_ranges[i] = var[i]; num_var_ranges = num_var; } mtrr_state.def_type = def_type; mtrr_state.enabled |= MTRR_STATE_MTRR_ENABLED; mtrr_state_set = 1; } static u8 type_merge(u8 type, u8 new_type, u8 *uniform) { u8 effective_type; if (type == MTRR_TYPE_INVALID) return new_type; effective_type = get_effective_type(type, new_type); if (type != effective_type) *uniform = 0; return effective_type; } /** * mtrr_type_lookup - look up memory type in MTRR * * @start: Begin of the physical address range * @end: End of the physical address range * @uniform: output argument: * - 1: the returned MTRR type is valid for the whole region * - 0: otherwise * * Return Values: * MTRR_TYPE_(type) - The effective MTRR type for the region * MTRR_TYPE_INVALID - MTRR is disabled */ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) { u8 type = MTRR_TYPE_INVALID; unsigned int i; if (!mtrr_state_set) { /* Uniformity is unknown. */ *uniform = 0; return MTRR_TYPE_UNCACHABLE; } *uniform = 1; if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) return MTRR_TYPE_UNCACHABLE; for (i = 0; i < cache_map_n && start < end; i++) { /* Region after current map entry? -> continue with next one. */ if (start >= cache_map[i].end) continue; /* Start of region not covered by current map entry? */ if (start < cache_map[i].start) { /* At least some part of region has default type. */ type = type_merge(type, mtrr_state.def_type, uniform); /* End of region not covered, too? -> lookup done. */ if (end <= cache_map[i].start) return type; } /* At least part of region covered by map entry. */ type = type_merge(type, cache_map[i].type, uniform); start = cache_map[i].end; } /* End of region past last entry in map? -> use default type. */ if (start < end) type = type_merge(type, mtrr_state.def_type, uniform); return type; } /* Get the MSR pair relating to a var range */ static void get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) { rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); } /* Fill the MSR pair relating to a var range */ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) { struct mtrr_var_range *vr; vr = mtrr_state.var_ranges; vr[index].base_lo = base_lo; vr[index].base_hi = base_hi; vr[index].mask_lo = mask_lo; vr[index].mask_hi = mask_hi; } static void get_fixed_ranges(mtrr_type *frs) { unsigned int *p = (unsigned int *)frs; int i; k8_check_syscfg_dram_mod_en(); rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); } void mtrr_save_fixed_ranges(void *info) { if (boot_cpu_has(X86_FEATURE_MTRR)) get_fixed_ranges(mtrr_state.fixed_ranges); } static unsigned __initdata last_fixed_start; static unsigned __initdata last_fixed_end; static mtrr_type __initdata last_fixed_type; static void __init print_fixed_last(void) { if (!last_fixed_end) return; pr_info(" %05X-%05X %s\n", last_fixed_start, last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); last_fixed_end = 0; } static void __init update_fixed_last(unsigned base, unsigned end, mtrr_type type) { last_fixed_start = base; last_fixed_end = end; last_fixed_type = type; } static void __init print_fixed(unsigned base, unsigned step, const mtrr_type *types) { unsigned i; for (i = 0; i < 8; ++i, ++types, base += step) { if (last_fixed_end == 0) { update_fixed_last(base, base + step, *types); continue; } if (last_fixed_end == base && last_fixed_type == *types) { last_fixed_end = base + step; continue; } /* new segments: gap or different type */ print_fixed_last(); update_fixed_last(base, base + step, *types); } } static void __init print_mtrr_state(void) { unsigned int i; int high_width; pr_info("MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { pr_info("MTRR fixed ranges %s:\n", str_enabled_disabled( (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED))); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); for (i = 0; i < 8; ++i) print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); /* tail */ print_fixed_last(); } pr_info("MTRR variable ranges %s:\n", str_enabled_disabled(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)); high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & MTRR_PHYSMASK_V) pr_info(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", i, high_width, mtrr_state.var_ranges[i].base_hi, mtrr_state.var_ranges[i].base_lo >> 12, high_width, mtrr_state.var_ranges[i].mask_hi, mtrr_state.var_ranges[i].mask_lo >> 12, mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & MTRR_PHYSBASE_TYPE)); else pr_info(" %u disabled\n", i); } if (mtrr_tom2) pr_info("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } /* Grab all of the MTRR state for this CPU into *state */ bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; unsigned lo, dummy; unsigned int i; vrs = mtrr_state.var_ranges; rdmsr(MSR_MTRRcap, lo, dummy); mtrr_state.have_fixed = lo & MTRR_CAP_FIX; for (i = 0; i < num_var_ranges; i++) get_mtrr_var_range(i, &vrs[i]); if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); rdmsr(MSR_MTRRdefType, lo, dummy); mtrr_state.def_type = lo & MTRR_DEF_TYPE_TYPE; mtrr_state.enabled = (lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT; if (amd_special_default_mtrr()) { unsigned low, high; /* TOP_MEM2 */ rdmsr(MSR_K8_TOP_MEM2, low, high); mtrr_tom2 = high; mtrr_tom2 <<= 32; mtrr_tom2 |= low; mtrr_tom2 &= 0xffffff800000ULL; } if (mtrr_debug) print_mtrr_state(); mtrr_state_set = 1; return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } /* Some BIOS's are messed up and don't set all MTRRs the same! */ void __init mtrr_state_warn(void) { unsigned long mask = smp_changes_mask; if (!mask) return; if (mask & MTRR_CHANGE_MASK_FIXED) pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_VARIABLE) pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_DEFTYPE) pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); pr_info("mtrr: probably your BIOS does not setup all CPUs.\n"); pr_info("mtrr: corrected configuration.\n"); } /* * Doesn't attempt to pass an error out to MTRR users * because it's quite complicated in some cases and probably not * worth it because the best error handling is to ignore it. */ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) { if (wrmsr_safe(msr, a, b) < 0) { pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", smp_processor_id(), msr, a, b); } } /** * set_fixed_range - checks & updates a fixed-range MTRR if it * differs from the value it should have * @msr: MSR address of the MTTR which should be checked and updated * @changed: pointer which indicates whether the MTRR needed to be changed * @msrwords: pointer to the MSR values which the MSR should have */ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) { unsigned lo, hi; rdmsr(msr, lo, hi); if (lo != msrwords[0] || hi != msrwords[1]) { mtrr_wrmsr(msr, msrwords[0], msrwords[1]); *changed = true; } } /** * generic_get_free_region - Get a free MTRR. * @base: The starting (base) address of the region. * @size: The size (in bytes) of the region. * @replace_reg: mtrr index to be replaced; set to invalid value if none. * * Returns: The index of the region on success, else negative on error. */ int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) { unsigned long lbase, lsize; mtrr_type ltype; int i, max; max = num_var_ranges; if (replace_reg >= 0 && replace_reg < max) return replace_reg; for (i = 0; i < max; ++i) { mtrr_if->get(i, &lbase, &lsize, &ltype); if (lsize == 0) return i; } return -ENOSPC; } static void generic_get_mtrr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type) { u32 mask_lo, mask_hi, base_lo, base_hi; unsigned int hi; u64 tmp, mask; /* * get_mtrr doesn't need to update mtrr_state, also it could be called * from any cpu, so try to print it out directly. */ get_cpu(); rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if (!(mask_lo & MTRR_PHYSMASK_V)) { /* Invalid (i.e. free) range */ *base = 0; *size = 0; *type = 0; goto out_put_cpu; } rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask: */ tmp = (u64)mask_hi << 32 | (mask_lo & PAGE_MASK); mask = (u64)phys_hi_rsvd << 32 | tmp; /* Expand tmp with high bits to all 1s: */ hi = fls64(tmp); if (hi > 0) { tmp |= ~((1ULL<<(hi - 1)) - 1); if (tmp != mask) { pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); mask = tmp; } } /* * This works correctly if size is a power of two, i.e. a * contiguous range: */ *size = -mask >> PAGE_SHIFT; *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; *type = base_lo & MTRR_PHYSBASE_TYPE; out_put_cpu: put_cpu(); } /** * set_fixed_ranges - checks & updates the fixed-range MTRRs if they * differ from the saved set * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() */ static int set_fixed_ranges(mtrr_type *frs) { unsigned long long *saved = (unsigned long long *)frs; bool changed = false; int block = -1, range; k8_check_syscfg_dram_mod_en(); while (fixed_range_blocks[++block].ranges) { for (range = 0; range < fixed_range_blocks[block].ranges; range++) set_fixed_range(fixed_range_blocks[block].base_msr + range, &changed, (unsigned int *)saved++); } return changed; } /* * Set the MSR pair relating to a var range. * Returns true if changes are made. */ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) { unsigned int lo, hi; bool changed = false; rdmsr(MTRRphysBase_MSR(index), lo, hi); if ((vr->base_lo & ~MTRR_PHYSBASE_RSVD) != (lo & ~MTRR_PHYSBASE_RSVD) || (vr->base_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) { mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); changed = true; } rdmsr(MTRRphysMask_MSR(index), lo, hi); if ((vr->mask_lo & ~MTRR_PHYSMASK_RSVD) != (lo & ~MTRR_PHYSMASK_RSVD) || (vr->mask_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) { mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); changed = true; } return changed; } static u32 deftype_lo, deftype_hi; /** * set_mtrr_state - Set the MTRR state for this CPU. * * NOTE: The CPU must already be in a safe state for MTRR changes, including * measures that only a single CPU can be active in set_mtrr_state() in * order to not be subject to races for usage of deftype_lo. This is * accomplished by taking cache_disable_lock. * RETURNS: 0 if no changes made, else a mask indicating what was changed. */ static unsigned long set_mtrr_state(void) { unsigned long change_mask = 0; unsigned int i; for (i = 0; i < num_var_ranges; i++) { if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) change_mask |= MTRR_CHANGE_MASK_VARIABLE; } if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) change_mask |= MTRR_CHANGE_MASK_FIXED; /* * Set_mtrr_restore restores the old value of MTRRdefType, * so to set it we fiddle with the saved value: */ if ((deftype_lo & MTRR_DEF_TYPE_TYPE) != mtrr_state.def_type || ((deftype_lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT) != mtrr_state.enabled) { deftype_lo = (deftype_lo & MTRR_DEF_TYPE_DISABLE) | mtrr_state.def_type | (mtrr_state.enabled << MTRR_STATE_SHIFT); change_mask |= MTRR_CHANGE_MASK_DEFTYPE; } return change_mask; } void mtrr_disable(void) { /* Save MTRR state */ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & MTRR_DEF_TYPE_DISABLE, deftype_hi); } void mtrr_enable(void) { /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); } void mtrr_generic_set_state(void) { unsigned long mask, count; /* Actually set the state */ mask = set_mtrr_state(); /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof(mask) * 8; ++count) { if (mask & 0x01) set_bit(count, &smp_changes_mask); mask >>= 1; } } /** * generic_set_mtrr - set variable MTRR register on the local CPU. * * @reg: The register to set. * @base: The base address of the region. * @size: The size of the region. If this is 0 the region is disabled. * @type: The type of the region. * * Returns nothing. */ static void generic_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { unsigned long flags; struct mtrr_var_range *vr; vr = &mtrr_state.var_ranges[reg]; local_irq_save(flags); cache_disable(); if (size == 0) { /* * The invalid bit is kept in the mask, so we simply * clear the relevant mask register to disable a range. */ mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); memset(vr, 0, sizeof(struct mtrr_var_range)); } else { vr->base_lo = base << PAGE_SHIFT | type; vr->base_hi = (base >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd; vr->mask_lo = -size << PAGE_SHIFT | MTRR_PHYSMASK_V; vr->mask_hi = (-size >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd; mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi); mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi); } cache_enable(); local_irq_restore(flags); } int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) { unsigned long lbase, last; /* * For Intel PPro stepping <= 7 * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF */ if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO && boot_cpu_data.x86_stepping <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); return -EINVAL; } } /* * Check upper bits of base and last are equal and lower bits are 0 * for base and 1 for last */ last = base + size - 1; for (lbase = base; !(lbase & 1) && (last & 1); lbase = lbase >> 1, last = last >> 1) ; if (lbase != last) { pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); return -EINVAL; } return 0; } static int generic_have_wrcomb(void) { unsigned long config, dummy; rdmsr(MSR_MTRRcap, config, dummy); return config & MTRR_CAP_WC; } int positive_have_wrcomb(void) { return 1; } /* * Generic structure... */ const struct mtrr_ops generic_mtrr_ops = { .get = generic_get_mtrr, .get_free_region = generic_get_free_region, .set = generic_set_mtrr, .validate_add_page = generic_validate_add_page, .have_wrcomb = generic_have_wrcomb, };
114 114 114 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/inode.c * * Copyright (C) 1992 Rick Sladkey * * nfs inode and superblock handling functions * * Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some * experimental NFS changes. Modularisation taken straight from SYS5 fs. * * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts. * J.S.Peatfield@damtp.cam.ac.uk * */ #include <linux/module.h> #include <linux/init.h> #include <linux/sched/signal.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/errno.h> #include <linux/unistd.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/nfs_xdr.h> #include <linux/slab.h> #include <linux/compat.h> #include <linux/freezer.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "iostat.h" #include "internal.h" #include "fscache.h" #include "pnfs.h" #include "nfs.h" #include "netns.h" #include "sysfs.h" #include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_VFS #define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 /* Default is to see 64-bit inode numbers */ static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; static int nfs_update_inode(struct inode *, struct nfs_fattr *); static struct kmem_cache * nfs_inode_cachep; static inline unsigned long nfs_fattr_to_ino_t(struct nfs_fattr *fattr) { return nfs_fileid_to_ino_t(fattr->fileid); } int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { if (unlikely(nfs_current_task_exiting())) return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid * * This function returns a 32-bit inode number if the boot parameter * nfs.enable_ino64 is zero. */ u64 nfs_compat_user_ino64(u64 fileid) { #ifdef CONFIG_COMPAT compat_ulong_t ino; #else unsigned long ino; #endif if (enable_ino64) return fileid; ino = fileid; if (sizeof(ino) < sizeof(fileid)) ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8; return ino; } int nfs_drop_inode(struct inode *inode) { return NFS_STALE(inode) || generic_drop_inode(inode); } EXPORT_SYMBOL_GPL(nfs_drop_inode); void nfs_clear_inode(struct inode *inode) { /* * The following should never happen... */ WARN_ON_ONCE(nfs_have_writebacks(inode)); WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files)); nfs_zap_acl_cache(inode); nfs_access_zap_cache(inode); nfs_fscache_clear_inode(inode); } EXPORT_SYMBOL_GPL(nfs_clear_inode); void nfs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); nfs_clear_inode(inode); } int nfs_sync_inode(struct inode *inode) { inode_dio_wait(inode); return nfs_wb_all(inode); } EXPORT_SYMBOL_GPL(nfs_sync_inode); /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk * @mapping: pointer to struct address_space */ int nfs_sync_mapping(struct address_space *mapping) { int ret = 0; if (mapping->nrpages != 0) { unmap_mapping_range(mapping, 0, 0, 0); ret = nfs_wb_all(mapping->host); } return ret; } static int nfs_attribute_timeout(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); } static bool nfs_check_cache_flags_invalid(struct inode *inode, unsigned long flags) { unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); return (cache_validity & flags) != 0; } bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) { if (nfs_check_cache_flags_invalid(inode, flags)) return true; return nfs_attribute_cache_expired(inode); } EXPORT_SYMBOL_GPL(nfs_check_cache_invalid); #ifdef CONFIG_NFS_V4_2 static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) { return nfsi->xattr_cache != NULL; } #else static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) { return false; } #endif void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); if (nfs_have_delegated_attributes(inode)) { if (!(flags & NFS_INO_REVAL_FORCED)) flags &= ~(NFS_INO_INVALID_MODE | NFS_INO_INVALID_OTHER | NFS_INO_INVALID_XATTR); flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); } if (!nfs_has_xattr_cache(nfsi)) flags &= ~NFS_INO_INVALID_XATTR; if (flags & NFS_INO_INVALID_DATA) nfs_fscache_invalidate(inode, 0); flags &= ~NFS_INO_REVAL_FORCED; flags |= nfsi->cache_validity; if (inode->i_mapping->nrpages == 0) flags &= ~NFS_INO_INVALID_DATA; /* pairs with nfs_clear_invalid_mapping()'s smp_load_acquire() */ smp_store_release(&nfsi->cache_validity, flags); if (inode->i_mapping->nrpages == 0 || nfsi->cache_validity & NFS_INO_INVALID_DATA) { nfs_ooo_clear(nfsi); } trace_nfs_set_cache_invalid(inode, 0); } EXPORT_SYMBOL_GPL(nfs_set_cache_invalid); /* * Invalidate the local caches */ static void nfs_zap_caches_locked(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); int mode = inode->i_mode; nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_XATTR); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_XATTR); nfs_zap_label_cache_locked(nfsi); } void nfs_zap_caches(struct inode *inode) { spin_lock(&inode->i_lock); nfs_zap_caches_locked(inode); spin_unlock(&inode->i_lock); } void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { if (mapping->nrpages != 0) { spin_lock(&inode->i_lock); nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); spin_unlock(&inode->i_lock); } } void nfs_zap_acl_cache(struct inode *inode) { void (*clear_acl_cache)(struct inode *); clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache; if (clear_acl_cache != NULL) clear_acl_cache(inode); spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_zap_acl_cache); void nfs_invalidate_atime(struct inode *inode) { if (nfs_have_delegated_atime(inode)) return; spin_lock(&inode->i_lock); nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_invalidate_atime); /* * Invalidate, but do not unhash, the inode. * NB: must be called with inode->i_lock held! */ static void nfs_set_inode_stale_locked(struct inode *inode) { set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); nfs_zap_caches_locked(inode); trace_nfs_set_inode_stale(inode); } void nfs_set_inode_stale(struct inode *inode) { spin_lock(&inode->i_lock); nfs_set_inode_stale_locked(inode); spin_unlock(&inode->i_lock); } struct nfs_find_desc { struct nfs_fh *fh; struct nfs_fattr *fattr; }; /* * In NFSv3 we can have 64bit inode numbers. In order to support * this, and re-exported directories (also seen in NFSv2) * we are forced to allow 2 different inodes to have the same * i_ino. */ static int nfs_find_actor(struct inode *inode, void *opaque) { struct nfs_find_desc *desc = opaque; struct nfs_fh *fh = desc->fh; struct nfs_fattr *fattr = desc->fattr; if (NFS_FILEID(inode) != fattr->fileid) return 0; if (inode_wrong_type(inode, fattr->mode)) return 0; if (nfs_compare_fh(NFS_FH(inode), fh)) return 0; if (is_bad_inode(inode) || NFS_STALE(inode)) return 0; return 1; } static int nfs_init_locked(struct inode *inode, void *opaque) { struct nfs_find_desc *desc = opaque; struct nfs_fattr *fattr = desc->fattr; set_nfs_fileid(inode, fattr->fileid); inode->i_mode = fattr->mode; nfs_copy_fh(NFS_FH(inode), desc->fh); return 0; } #ifdef CONFIG_NFS_V4_SECURITY_LABEL static void nfs_clear_label_invalid(struct inode *inode) { spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL; spin_unlock(&inode->i_lock); } void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr) { int error; if (fattr->label == NULL) return; if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) { error = security_inode_notifysecctx(inode, fattr->label->label, fattr->label->len); if (error) printk(KERN_ERR "%s() %s %d " "security_inode_notifysecctx() %d\n", __func__, (char *)fattr->label->label, fattr->label->len, error); nfs_clear_label_invalid(inode); } } struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { struct nfs4_label *label; if (!(server->caps & NFS_CAP_SECURITY_LABEL)) return NULL; label = kzalloc(sizeof(struct nfs4_label), flags); if (label == NULL) return ERR_PTR(-ENOMEM); label->label = kzalloc(NFS4_MAXLABELLEN, flags); if (label->label == NULL) { kfree(label); return ERR_PTR(-ENOMEM); } label->len = NFS4_MAXLABELLEN; return label; } EXPORT_SYMBOL_GPL(nfs4_label_alloc); #else void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr) { } #endif EXPORT_SYMBOL_GPL(nfs_setsecurity); /* Search for inode identified by fh, fileid and i_mode in inode cache. */ struct inode * nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh) { struct nfs_find_desc desc = { .fh = fh, .fattr = fattr, }; struct inode *inode; unsigned long hash; if (!(fattr->valid & NFS_ATTR_FATTR_FILEID) || !(fattr->valid & NFS_ATTR_FATTR_TYPE)) return NULL; hash = nfs_fattr_to_ino_t(fattr); inode = ilookup5(sb, hash, nfs_find_actor, &desc); dprintk("%s: returning %p\n", __func__, inode); return inode; } static void nfs_inode_init_regular(struct nfs_inode *nfsi) { atomic_long_set(&nfsi->nrequests, 0); atomic_long_set(&nfsi->redirtied_pages, 0); INIT_LIST_HEAD(&nfsi->commit_info.list); atomic_long_set(&nfsi->commit_info.ncommit, 0); atomic_set(&nfsi->commit_info.rpcs_out, 0); mutex_init(&nfsi->commit_mutex); } static void nfs_inode_init_dir(struct nfs_inode *nfsi) { nfsi->cache_change_attribute = 0; memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); init_rwsem(&nfsi->rmdir_sem); } /* * This is our front-end to iget that looks up inodes by file handle * instead of inode number. */ struct inode * nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) { struct nfs_find_desc desc = { .fh = fh, .fattr = fattr }; struct inode *inode = ERR_PTR(-ENOENT); u64 fattr_supported = NFS_SB(sb)->fattr_valid; unsigned long hash; nfs_attr_check_mountpoint(sb, fattr); if (nfs_attr_use_mounted_on_fileid(fattr)) fattr->fileid = fattr->mounted_on_fileid; else if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) goto out_no_inode; if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; hash = nfs_fattr_to_ino_t(fattr); inode = iget5_locked(sb, hash, nfs_find_actor, nfs_init_locked, &desc); if (inode == NULL) { inode = ERR_PTR(-ENOMEM); goto out_no_inode; } if (inode->i_state & I_NEW) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long now = jiffies; /* We set i_ino for the few things that still rely on it, * such as stat(2) */ inode->i_ino = hash; /* We can't support update_atime(), since the server will reset it */ inode->i_flags |= S_NOATIME|S_NOCMTIME; inode->i_mode = fattr->mode; nfsi->cache_validity = 0; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 && (fattr_supported & NFS_ATTR_FATTR_MODE)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; if (S_ISREG(inode->i_mode)) { inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; nfs_inode_init_regular(nfsi); mapping_set_large_folios(inode->i_mapping); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; inode->i_data.a_ops = &nfs_dir_aops; nfs_inode_init_dir(nfsi); /* Deal with crossing mountpoints */ if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) inode->i_op = &nfs_referral_inode_operations; else inode->i_op = &nfs_mountpoint_inode_operations; inode->i_fop = NULL; inode->i_flags |= S_AUTOMOUNT; } } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &nfs_symlink_inode_operations; inode_nohighmem(inode); } else init_special_inode(inode, inode->i_mode, fattr->rdev); inode_set_atime(inode, 0, 0); inode_set_mtime(inode, 0, 0); inode_set_ctime(inode, 0, 0); inode_set_iversion_raw(inode, 0); inode->i_size = 0; clear_nlink(inode); inode->i_uid = make_kuid(&init_user_ns, -2); inode->i_gid = make_kgid(&init_user_ns, -2); inode->i_blocks = 0; nfsi->write_io = 0; nfsi->read_io = 0; nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode_set_atime_to_ts(inode, fattr->atime); else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode_set_mtime_to_ts(inode, fattr->mtime); else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode_set_ctime_to_ts(inode, fattr->ctime); else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode_set_iversion_raw(inode, fattr->change_attr); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE); if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (fattr_supported & NFS_ATTR_FATTR_OWNER) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; else if (fattr_supported & NFS_ATTR_FATTR_GROUP) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED && fattr->size != 0) nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED && fattr->size != 0) nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); nfs_setsecurity(inode, fattr); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; nfsi->access_cache = RB_ROOT; nfs_fscache_init_inode(inode); unlock_new_inode(inode); } else { int err = nfs_refresh_inode(inode, fattr); if (err < 0) { iput(inode); inode = ERR_PTR(err); goto out_no_inode; } } dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), atomic_read(&inode->i_count)); out: return inode; out_no_inode: dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode)); goto out; } EXPORT_SYMBOL_GPL(nfs_fhget); static void nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr) { unsigned long cache_validity = NFS_I(inode)->cache_validity; if (nfs_have_delegated_mtime(inode)) { if (!(cache_validity & NFS_INO_INVALID_CTIME)) fattr->valid &= ~(NFS_ATTR_FATTR_PRECTIME | NFS_ATTR_FATTR_CTIME); if (!(cache_validity & NFS_INO_INVALID_MTIME)) fattr->valid &= ~(NFS_ATTR_FATTR_PREMTIME | NFS_ATTR_FATTR_MTIME); if (!(cache_validity & NFS_INO_INVALID_ATIME)) fattr->valid &= ~NFS_ATTR_FATTR_ATIME; } else if (nfs_have_delegated_atime(inode)) { if (!(cache_validity & NFS_INO_INVALID_ATIME)) fattr->valid &= ~NFS_ATTR_FATTR_ATIME; } } static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid) { enum file_time_flags time_flags = 0; unsigned int cache_flags = 0; if (ia_valid & ATTR_MTIME) { time_flags |= S_MTIME | S_CTIME; cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; } if (ia_valid & ATTR_ATIME) { time_flags |= S_ATIME; cache_flags |= NFS_INO_INVALID_ATIME; } inode_update_timestamps(inode, time_flags); NFS_I(inode)->cache_validity &= ~cache_flags; } void nfs_update_delegated_atime(struct inode *inode) { spin_lock(&inode->i_lock); if (nfs_have_delegated_atime(inode)) nfs_update_timestamps(inode, ATTR_ATIME); spin_unlock(&inode->i_lock); } void nfs_update_delegated_mtime_locked(struct inode *inode) { if (nfs_have_delegated_mtime(inode)) nfs_update_timestamps(inode, ATTR_MTIME); } void nfs_update_delegated_mtime(struct inode *inode) { spin_lock(&inode->i_lock); nfs_update_delegated_mtime_locked(inode); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_update_delegated_mtime); #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) int nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; int error = 0; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); /* skip mode change if it's just for clearing setuid/setgid */ if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) attr->ia_valid &= ~ATTR_MODE; if (attr->ia_valid & ATTR_SIZE) { BUG_ON(!S_ISREG(inode->i_mode)); error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; if (attr->ia_size == i_size_read(inode)) attr->ia_valid &= ~ATTR_SIZE; } if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { spin_lock(&inode->i_lock); nfs_update_timestamps(inode, attr->ia_valid); spin_unlock(&inode->i_lock); attr->ia_valid &= ~(ATTR_MTIME | ATTR_ATIME); } else if (nfs_have_delegated_atime(inode) && attr->ia_valid & ATTR_ATIME && !(attr->ia_valid & ATTR_MTIME)) { nfs_update_delegated_atime(inode); attr->ia_valid &= ~ATTR_ATIME; } /* Optimization: if the end result is no change, don't RPC */ if (((attr->ia_valid & NFS_VALID_ATTRS) & ~(ATTR_FILE|ATTR_OPEN)) == 0) return 0; trace_nfs_setattr_enter(inode); /* Write all dirty data */ if (S_ISREG(inode->i_mode)) nfs_sync_inode(inode); fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fattr == NULL) { error = -ENOMEM; goto out; } error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); if (error == 0) error = nfs_refresh_inode(inode, fattr); nfs_free_fattr(fattr); out: trace_nfs_setattr_exit(inode, error); return error; } EXPORT_SYMBOL_GPL(nfs_setattr); /** * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall * @inode: inode of the file used * @offset: file offset to start truncating * * This is a copy of the common vmtruncate, but with the locking * corrected to take into account the fact that NFS requires * inode->i_size to be updated under the inode->i_lock. * Note: must be called with inode->i_lock held! */ static int nfs_vmtruncate(struct inode * inode, loff_t offset) { int err; err = inode_newsize_ok(inode, offset); if (err) goto out; trace_nfs_size_truncate(inode, offset); i_size_write(inode, offset); /* Optimisation */ if (offset == 0) { NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; nfs_ooo_clear(NFS_I(inode)); } NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); nfs_update_delegated_mtime_locked(inode); spin_lock(&inode->i_lock); out: return err; } /** * nfs_setattr_update_inode - Update inode metadata after a setattr call. * @inode: pointer to struct inode * @attr: pointer to struct iattr * @fattr: pointer to struct nfs_fattr * * Note: we do this in the *proc.c in order to ensure that * it works for things like exclusive creates too. */ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *fattr) { /* Barrier: bump the attribute generation count. */ nfs_fattr_set_barrier(fattr); spin_lock(&inode->i_lock); NFS_I(inode)->attr_gencount = fattr->gencount; if ((attr->ia_valid & ATTR_SIZE) != 0) { if (!nfs_have_delegated_mtime(inode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_CTIME; if ((attr->ia_valid & ATTR_KILL_SUID) != 0 && inode->i_mode & S_ISUID) inode->i_mode &= ~S_ISUID; if (setattr_should_drop_sgid(&nop_mnt_idmap, inode)) inode->i_mode &= ~S_ISGID; if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; } if ((attr->ia_valid & ATTR_UID) != 0) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL); } if (attr->ia_valid & (ATTR_ATIME_SET|ATTR_ATIME)) { NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode_set_atime_to_ts(inode, fattr->atime); else if (attr->ia_valid & ATTR_ATIME_SET) inode_set_atime_to_ts(inode, attr->ia_atime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); } if (attr->ia_valid & (ATTR_MTIME_SET|ATTR_MTIME)) { NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode_set_mtime_to_ts(inode, fattr->mtime); else if (attr->ia_valid & ATTR_MTIME_SET) inode_set_mtime_to_ts(inode, attr->ia_mtime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); } if (fattr->valid) nfs_update_inode(inode, fattr); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); /* * Don't request help from readdirplus if the file is being written to, * or if attribute caching is turned off */ static bool nfs_getattr_readdirplus_enable(const struct inode *inode) { return nfs_server_capable(inode, NFS_CAP_READDIRPLUS) && !nfs_have_writebacks(inode) && NFS_MAXATTRTIMEO(inode) > 5 * HZ; } static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry) { if (!IS_ROOT(dentry)) { struct dentry *parent = dget_parent(dentry); nfs_readdir_record_entry_cache_miss(d_inode(parent)); dput(parent); } } static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) { if (!IS_ROOT(dentry)) { struct dentry *parent = dget_parent(dentry); nfs_readdir_record_entry_cache_hit(d_inode(parent)); dput(parent); } } static u32 nfs_get_valid_attrmask(struct inode *inode) { unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); u32 reply_mask = STATX_INO | STATX_TYPE; if (!(cache_validity & NFS_INO_INVALID_ATIME)) reply_mask |= STATX_ATIME; if (!(cache_validity & NFS_INO_INVALID_CTIME)) reply_mask |= STATX_CTIME; if (!(cache_validity & NFS_INO_INVALID_MTIME)) reply_mask |= STATX_MTIME; if (!(cache_validity & NFS_INO_INVALID_SIZE)) reply_mask |= STATX_SIZE; if (!(cache_validity & NFS_INO_INVALID_NLINK)) reply_mask |= STATX_NLINK; if (!(cache_validity & NFS_INO_INVALID_MODE)) reply_mask |= STATX_MODE; if (!(cache_validity & NFS_INO_INVALID_OTHER)) reply_mask |= STATX_UID | STATX_GID; if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) reply_mask |= STATX_BLOCKS; if (!(cache_validity & NFS_INO_INVALID_CHANGE)) reply_mask |= STATX_CHANGE_COOKIE; return reply_mask; } int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct nfs_server *server = NFS_SERVER(inode); unsigned long cache_validity; int err = 0; bool force_sync = query_flags & AT_STATX_FORCE_SYNC; bool do_update = false; bool readdirplus_enabled = nfs_getattr_readdirplus_enable(inode); trace_nfs_getattr_enter(inode); request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID | STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME | STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_CHANGE_COOKIE; if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { if (readdirplus_enabled) nfs_readdirplus_parent_cache_hit(path->dentry); goto out_no_revalidate; } /* Flush out writes to the server in order to update c/mtime/version. */ if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_CHANGE_COOKIE)) && S_ISREG(inode->i_mode)) { if (nfs_have_delegated_mtime(inode)) filemap_fdatawrite(inode->i_mapping); else filemap_write_and_wait(inode->i_mapping); } /* * We may force a getattr if the user cares about atime. * * Note that we only have to check the vfsmount flags here: * - NFS always sets S_NOATIME by so checking it would give a * bogus result * - NFS never sets SB_NOATIME or SB_NODIRATIME so there is * no point in checking those. */ if ((path->mnt->mnt_flags & MNT_NOATIME) || ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) request_mask &= ~STATX_ATIME; /* Is the user requesting attributes that might need revalidation? */ if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| STATX_MTIME|STATX_UID|STATX_GID| STATX_SIZE|STATX_BLOCKS| STATX_CHANGE_COOKIE))) goto out_no_revalidate; /* Check whether the cached attributes are stale */ do_update |= force_sync || nfs_attribute_cache_expired(inode); cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); do_update |= cache_validity & NFS_INO_INVALID_CHANGE; if (request_mask & STATX_ATIME) do_update |= cache_validity & NFS_INO_INVALID_ATIME; if (request_mask & STATX_CTIME) do_update |= cache_validity & NFS_INO_INVALID_CTIME; if (request_mask & STATX_MTIME) do_update |= cache_validity & NFS_INO_INVALID_MTIME; if (request_mask & STATX_SIZE) do_update |= cache_validity & NFS_INO_INVALID_SIZE; if (request_mask & STATX_NLINK) do_update |= cache_validity & NFS_INO_INVALID_NLINK; if (request_mask & STATX_MODE) do_update |= cache_validity & NFS_INO_INVALID_MODE; if (request_mask & (STATX_UID | STATX_GID)) do_update |= cache_validity & NFS_INO_INVALID_OTHER; if (request_mask & STATX_BLOCKS) do_update |= cache_validity & NFS_INO_INVALID_BLOCKS; if (do_update) { if (readdirplus_enabled) nfs_readdirplus_parent_cache_miss(path->dentry); err = __nfs_revalidate_inode(server, inode); if (err) goto out; } else if (readdirplus_enabled) nfs_readdirplus_parent_cache_hit(path->dentry); out_no_revalidate: /* Only return attributes that were revalidated. */ stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); stat->change_cookie = inode_peek_iversion_raw(inode); stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; if (server->change_attr_type != NFS4_CHANGE_TYPE_IS_UNDEFINED) stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; out: trace_nfs_getattr_exit(inode, err); return err; } EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { refcount_set(&l_ctx->count, 1); l_ctx->lockowner = current->files; INIT_LIST_HEAD(&l_ctx->list); atomic_set(&l_ctx->io_count, 0); } static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) { struct nfs_lock_context *pos; list_for_each_entry_rcu(pos, &ctx->lock_context.list, list) { if (pos->lockowner != current->files) continue; if (refcount_inc_not_zero(&pos->count)) return pos; } return NULL; } struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) { struct nfs_lock_context *res, *new = NULL; struct inode *inode = d_inode(ctx->dentry); rcu_read_lock(); res = __nfs_find_lock_context(ctx); rcu_read_unlock(); if (res == NULL) { new = kmalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); if (new == NULL) return ERR_PTR(-ENOMEM); nfs_init_lock_context(new); spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); if (res == NULL) { new->open_context = get_nfs_open_context(ctx); if (new->open_context) { list_add_tail_rcu(&new->list, &ctx->lock_context.list); res = new; new = NULL; } else res = ERR_PTR(-EBADF); } spin_unlock(&inode->i_lock); kfree(new); } return res; } EXPORT_SYMBOL_GPL(nfs_get_lock_context); void nfs_put_lock_context(struct nfs_lock_context *l_ctx) { struct nfs_open_context *ctx = l_ctx->open_context; struct inode *inode = d_inode(ctx->dentry); if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; list_del_rcu(&l_ctx->list); spin_unlock(&inode->i_lock); put_nfs_open_context(ctx); kfree_rcu(l_ctx, rcu_head); } EXPORT_SYMBOL_GPL(nfs_put_lock_context); /** * nfs_close_context - Common close_context() routine NFSv2/v3 * @ctx: pointer to context * @is_sync: is this a synchronous close * * Ensure that the attributes are up to date if we're mounted * with close-to-open semantics and we have cached data that will * need to be revalidated on open. */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) { struct nfs_inode *nfsi; struct inode *inode; if (!(ctx->mode & FMODE_WRITE)) return; if (!is_sync) return; inode = d_inode(ctx->dentry); if (nfs_have_read_or_write_delegation(inode)) return; nfsi = NFS_I(inode); if (inode->i_mapping->nrpages == 0) return; if (nfsi->cache_validity & NFS_INO_INVALID_DATA) return; if (!list_empty(&nfsi->open_files)) return; if (NFS_SERVER(inode)->flags & NFS_MOUNT_NOCTO) return; nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); } EXPORT_SYMBOL_GPL(nfs_close_context); struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode, struct file *filp) { struct nfs_open_context *ctx; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); if (!ctx) return ERR_PTR(-ENOMEM); nfs_sb_active(dentry->d_sb); ctx->dentry = dget(dentry); if (filp) ctx->cred = get_cred(filp->f_cred); else ctx->cred = get_current_cred(); rcu_assign_pointer(ctx->ll_cred, NULL); ctx->state = NULL; ctx->mode = f_mode; ctx->flags = 0; ctx->error = 0; ctx->flock_owner = (fl_owner_t)filp; nfs_init_lock_context(&ctx->lock_context); ctx->lock_context.open_context = ctx; INIT_LIST_HEAD(&ctx->list); ctx->mdsthreshold = NULL; nfs_localio_file_init(&ctx->nfl); return ctx; } EXPORT_SYMBOL_GPL(alloc_nfs_open_context); struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { if (ctx != NULL && refcount_inc_not_zero(&ctx->lock_context.count)) return ctx; return NULL; } EXPORT_SYMBOL_GPL(get_nfs_open_context); static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { struct inode *inode = d_inode(ctx->dentry); struct super_block *sb = ctx->dentry->d_sb; if (!refcount_dec_and_test(&ctx->lock_context.count)) return; if (!list_empty(&ctx->list)) { spin_lock(&inode->i_lock); list_del_rcu(&ctx->list); spin_unlock(&inode->i_lock); } if (inode != NULL) NFS_PROTO(inode)->close_context(ctx, is_sync); put_cred(ctx->cred); dput(ctx->dentry); nfs_sb_deactive(sb); put_rpccred(rcu_dereference_protected(ctx->ll_cred, 1)); kfree(ctx->mdsthreshold); nfs_close_local_fh(&ctx->nfl); kfree_rcu(ctx, rcu_head); } void put_nfs_open_context(struct nfs_open_context *ctx) { __put_nfs_open_context(ctx, 0); } EXPORT_SYMBOL_GPL(put_nfs_open_context); static void put_nfs_open_context_sync(struct nfs_open_context *ctx) { __put_nfs_open_context(ctx, 1); } /* * Ensure that mmap has a recent RPC credential for use when writing out * shared pages */ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) { struct inode *inode = d_inode(ctx->dentry); struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); if (list_empty(&nfsi->open_files) && nfs_ooo_test(nfsi)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA | NFS_INO_REVAL_FORCED); list_add_tail_rcu(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) { filp->private_data = get_nfs_open_context(ctx); set_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); if (list_empty(&ctx->list)) nfs_inode_attach_open_context(ctx); } EXPORT_SYMBOL_GPL(nfs_file_set_open_context); /* * Given an inode, search for an open context with the desired characteristics */ struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct cred *cred, fmode_t mode) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *pos, *ctx = NULL; rcu_read_lock(); list_for_each_entry_rcu(pos, &nfsi->open_files, list) { if (cred != NULL && cred_fscmp(pos->cred, cred) != 0) continue; if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) continue; if (!test_bit(NFS_CONTEXT_FILE_OPEN, &pos->flags)) continue; ctx = get_nfs_open_context(pos); if (ctx) break; } rcu_read_unlock(); return ctx; } void nfs_file_clear_open_context(struct file *filp) { struct nfs_open_context *ctx = nfs_file_open_context(filp); if (ctx) { struct inode *inode = d_inode(ctx->dentry); clear_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); /* * We fatal error on write before. Try to writeback * every page again. */ if (ctx->error < 0) invalidate_inode_pages2(inode->i_mapping); filp->private_data = NULL; put_nfs_open_context_sync(ctx); } } /* * These allocate and release file read/write context information. */ int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; ctx = alloc_nfs_open_context(file_dentry(filp), flags_to_mode(filp->f_flags), filp); if (IS_ERR(ctx)) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); nfs_fscache_open_file(inode, filp); return 0; } /* * This function is called whenever some part of NFS notices that * the cached attributes have to be refreshed. */ int __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { int status = -ESTALE; struct nfs_fattr *fattr = NULL; struct nfs_inode *nfsi = NFS_I(inode); dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); trace_nfs_revalidate_inode_enter(inode); if (is_bad_inode(inode)) goto out; if (NFS_STALE(inode)) goto out; /* pNFS: Attributes aren't updated until we layoutcommit */ if (S_ISREG(inode->i_mode)) { status = pnfs_sync_inode(inode, false); if (status) goto out; } status = -ENOMEM; fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fattr == NULL) goto out; nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, inode); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), status); switch (status) { case -ETIMEDOUT: /* A soft timeout occurred. Use cached information? */ if (server->flags & NFS_MOUNT_SOFTREVAL) status = 0; break; case -ESTALE: if (!S_ISDIR(inode->i_mode)) nfs_set_inode_stale(inode); else nfs_zap_caches(inode); } goto out; } status = nfs_refresh_inode(inode, fattr); if (status) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), status); goto out; } if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); nfs_setsecurity(inode, fattr); dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); out: nfs_free_fattr(fattr); trace_nfs_revalidate_inode_exit(inode, status); return status; } int nfs_attribute_cache_expired(struct inode *inode) { if (nfs_have_delegated_attributes(inode)) return 0; return nfs_attribute_timeout(inode); } /** * nfs_revalidate_inode - Revalidate the inode attributes * @inode: pointer to inode struct * @flags: cache flags to check * * Updates inode attribute information by retrieving the data from the server. */ int nfs_revalidate_inode(struct inode *inode, unsigned long flags) { if (!nfs_check_cache_invalid(inode, flags)) return NFS_STALE(inode) ? -ESTALE : 0; return __nfs_revalidate_inode(NFS_SERVER(inode), inode); } EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { int ret; nfs_fscache_invalidate(inode, 0); if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { ret = nfs_sync_mapping(mapping); if (ret < 0) return ret; } ret = invalidate_inode_pages2(mapping); if (ret < 0) return ret; } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); return 0; } /** * nfs_clear_invalid_mapping - Conditionally clear a mapping * @mapping: pointer to mapping * * If the NFS_INO_INVALID_DATA inode flag is set, clear the mapping. */ int nfs_clear_invalid_mapping(struct address_space *mapping) { struct inode *inode = mapping->host; struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; int ret = 0; /* * We must clear NFS_INO_INVALID_DATA first to ensure that * invalidations that come in while we're shooting down the mappings * are respected. But, that leaves a race window where one revalidator * can clear the flag, and then another checks it before the mapping * gets invalidated. Fix that by serializing access to this part of * the function. * * At the same time, we need to allow other tasks to see whether we * might be in the middle of invalidating the pages, so we only set * the bit lock here if it looks like we're going to be doing that. */ for (;;) { ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, nfs_wait_bit_killable, TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (ret) goto out; smp_rmb(); /* pairs with smp_wmb() below */ if (test_bit(NFS_INO_INVALIDATING, bitlock)) continue; /* pairs with nfs_set_cache_invalid()'s smp_store_release() */ if (!(smp_load_acquire(&nfsi->cache_validity) & NFS_INO_INVALID_DATA)) goto out; /* Slow-path that double-checks with spinlock held */ spin_lock(&inode->i_lock); if (test_bit(NFS_INO_INVALIDATING, bitlock)) { spin_unlock(&inode->i_lock); continue; } if (nfsi->cache_validity & NFS_INO_INVALID_DATA) break; spin_unlock(&inode->i_lock); goto out; } set_bit(NFS_INO_INVALIDATING, bitlock); smp_wmb(); nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; nfs_ooo_clear(nfsi); spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); smp_mb__after_atomic(); wake_up_bit(bitlock, NFS_INO_INVALIDATING); out: return ret; } bool nfs_mapping_need_revalidate_inode(struct inode *inode) { return nfs_check_cache_invalid(inode, NFS_INO_INVALID_CHANGE) || NFS_STALE(inode); } int nfs_revalidate_mapping_rcu(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; int ret = 0; if (IS_SWAPFILE(inode)) goto out; if (nfs_mapping_need_revalidate_inode(inode)) { ret = -ECHILD; goto out; } spin_lock(&inode->i_lock); if (test_bit(NFS_INO_INVALIDATING, bitlock) || (nfsi->cache_validity & NFS_INO_INVALID_DATA)) ret = -ECHILD; spin_unlock(&inode->i_lock); out: return ret; } /** * nfs_revalidate_mapping - Revalidate the pagecache * @inode: pointer to host inode * @mapping: pointer to mapping */ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) { /* swapfiles are not supposed to be shared. */ if (IS_SWAPFILE(inode)) return 0; if (nfs_mapping_need_revalidate_inode(inode)) { int ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (ret < 0) return ret; } return nfs_clear_invalid_mapping(mapping); } static bool nfs_file_has_writers(struct nfs_inode *nfsi) { struct inode *inode = &nfsi->vfs_inode; if (!S_ISREG(inode->i_mode)) return false; if (list_empty(&nfsi->open_files)) return false; return inode_is_open_for_write(inode); } static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) { return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); } static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct timespec64 ts; if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) && inode_eq_iversion_raw(inode, fattr->pre_change_attr)) { inode_set_iversion_raw(inode, fattr->change_attr); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); else if (nfs_server_capable(inode, NFS_CAP_XATTR)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); } /* If we have atomic WCC data, we may update some attributes */ ts = inode_get_ctime(inode); if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) && timespec64_equal(&ts, &fattr->pre_ctime)) { inode_set_ctime_to_ts(inode, fattr->ctime); } ts = inode_get_mtime(inode); if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) && (fattr->valid & NFS_ATTR_FATTR_MTIME) && timespec64_equal(&ts, &fattr->pre_mtime)) { inode_set_mtime_to_ts(inode, fattr->mtime); } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && !nfs_have_writebacks(inode)) { trace_nfs_size_wcc(inode, fattr->size); i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } } /** * nfs_check_inode_attributes - verify consistency of the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * Verifies the attribute cache. If we have just changed the attributes, * so that fattr carries weak cache consistency data, then it may * also update the ctime/mtime/change_attribute. */ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_size, new_isize; unsigned long invalid = 0; struct timespec64 ts; if (nfs_have_delegated_attributes(inode)) return 0; if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? Just exit */ if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) return 0; /* Has the inode gone and changed behind our back? */ } else if (nfsi->fileid != fattr->fileid) { /* Is this perhaps the mounted-on fileid? */ if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && nfsi->fileid == fattr->mounted_on_fileid) return 0; return -ESTALE; } if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) return -ESTALE; if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) invalid |= NFS_INO_INVALID_CHANGE; ts = inode_get_mtime(inode); if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime)) invalid |= NFS_INO_INVALID_MTIME; ts = inode_get_ctime(inode); if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime)) invalid |= NFS_INO_INVALID_CTIME; if (fattr->valid & NFS_ATTR_FATTR_SIZE) { cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); if (cur_size != new_isize) invalid |= NFS_INO_INVALID_SIZE; } } /* Have any file permissions changed? */ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) invalid |= NFS_INO_INVALID_MODE; if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid)) invalid |= NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid)) invalid |= NFS_INO_INVALID_OTHER; /* Has the link count changed? */ if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_NLINK; ts = inode_get_atime(inode); if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) nfs_set_cache_invalid(inode, invalid); nfsi->read_cache_jiffies = fattr->time_start; return 0; } static atomic_long_t nfs_attr_generation_counter; static unsigned long nfs_read_attr_generation_counter(void) { return atomic_long_read(&nfs_attr_generation_counter); } unsigned long nfs_inc_attr_generation_counter(void) { return atomic_long_inc_return(&nfs_attr_generation_counter); } EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter); void nfs_fattr_init(struct nfs_fattr *fattr) { fattr->valid = 0; fattr->time_start = jiffies; fattr->gencount = nfs_inc_attr_generation_counter(); fattr->owner_name = NULL; fattr->group_name = NULL; fattr->mdsthreshold = NULL; } EXPORT_SYMBOL_GPL(nfs_fattr_init); /** * nfs_fattr_set_barrier * @fattr: attributes * * Used to set a barrier after an attribute was updated. This * barrier ensures that older attributes from RPC calls that may * have raced with our update cannot clobber these new values. * Note that you are still responsible for ensuring that other * operations which change the attribute on the server do not * collide. */ void nfs_fattr_set_barrier(struct nfs_fattr *fattr) { fattr->gencount = nfs_inc_attr_generation_counter(); } struct nfs_fattr *nfs_alloc_fattr(void) { struct nfs_fattr *fattr; fattr = kmalloc(sizeof(*fattr), GFP_KERNEL); if (fattr != NULL) { nfs_fattr_init(fattr); fattr->label = NULL; } return fattr; } EXPORT_SYMBOL_GPL(nfs_alloc_fattr); struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server) { struct nfs_fattr *fattr = nfs_alloc_fattr(); if (!fattr) return NULL; fattr->label = nfs4_label_alloc(server, GFP_KERNEL); if (IS_ERR(fattr->label)) { kfree(fattr); return NULL; } return fattr; } EXPORT_SYMBOL_GPL(nfs_alloc_fattr_with_label); struct nfs_fh *nfs_alloc_fhandle(void) { struct nfs_fh *fh; fh = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); if (fh != NULL) fh->size = 0; return fh; } EXPORT_SYMBOL_GPL(nfs_alloc_fhandle); #ifdef NFS_DEBUG /* * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle * in the same way that wireshark does * * @fh: file handle * * For debugging only. */ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh) { /* wireshark uses 32-bit AUTODIN crc and does a bitwise * not on the result */ return nfs_fhandle_hash(fh); } EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash); /* * _nfs_display_fhandle - display an NFS file handle on the console * * @fh: file handle to display * @caption: display caption * * For debugging only. */ void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) { unsigned short i; if (fh == NULL || fh->size == 0) { printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh); return; } printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n", caption, fh, fh->size, _nfs_display_fhandle_hash(fh)); for (i = 0; i < fh->size; i += 16) { __be32 *pos = (__be32 *)&fh->data[i]; switch ((fh->size - i - 1) >> 2) { case 0: printk(KERN_DEFAULT " %08x\n", be32_to_cpup(pos)); break; case 1: printk(KERN_DEFAULT " %08x %08x\n", be32_to_cpup(pos), be32_to_cpup(pos + 1)); break; case 2: printk(KERN_DEFAULT " %08x %08x %08x\n", be32_to_cpup(pos), be32_to_cpup(pos + 1), be32_to_cpup(pos + 2)); break; default: printk(KERN_DEFAULT " %08x %08x %08x %08x\n", be32_to_cpup(pos), be32_to_cpup(pos + 1), be32_to_cpup(pos + 2), be32_to_cpup(pos + 3)); } } } EXPORT_SYMBOL_GPL(_nfs_display_fhandle); #endif /** * nfs_inode_attrs_cmp_generic - compare attributes * @fattr: attributes * @inode: pointer to inode * * Attempt to divine whether or not an RPC call reply carrying stale * attributes got scheduled after another call carrying updated ones. * Note also the check for wraparound of 'attr_gencount' * * The function returns '1' if it thinks the attributes in @fattr are * more recent than the ones cached in @inode. Otherwise it returns * the value '0'. */ static int nfs_inode_attrs_cmp_generic(const struct nfs_fattr *fattr, const struct inode *inode) { unsigned long attr_gencount = NFS_I(inode)->attr_gencount; return (long)(fattr->gencount - attr_gencount) > 0 || (long)(attr_gencount - nfs_read_attr_generation_counter()) > 0; } /** * nfs_inode_attrs_cmp_monotonic - compare attributes * @fattr: attributes * @inode: pointer to inode * * Attempt to divine whether or not an RPC call reply carrying stale * attributes got scheduled after another call carrying updated ones. * * We assume that the server observes monotonic semantics for * the change attribute, so a larger value means that the attributes in * @fattr are more recent, in which case the function returns the * value '1'. * A return value of '0' indicates no measurable change * A return value of '-1' means that the attributes in @inode are * more recent. */ static int nfs_inode_attrs_cmp_monotonic(const struct nfs_fattr *fattr, const struct inode *inode) { s64 diff = fattr->change_attr - inode_peek_iversion_raw(inode); if (diff > 0) return 1; return diff == 0 ? 0 : -1; } /** * nfs_inode_attrs_cmp_strict_monotonic - compare attributes * @fattr: attributes * @inode: pointer to inode * * Attempt to divine whether or not an RPC call reply carrying stale * attributes got scheduled after another call carrying updated ones. * * We assume that the server observes strictly monotonic semantics for * the change attribute, so a larger value means that the attributes in * @fattr are more recent, in which case the function returns the * value '1'. * A return value of '-1' means that the attributes in @inode are * more recent or unchanged. */ static int nfs_inode_attrs_cmp_strict_monotonic(const struct nfs_fattr *fattr, const struct inode *inode) { return nfs_inode_attrs_cmp_monotonic(fattr, inode) > 0 ? 1 : -1; } /** * nfs_inode_attrs_cmp - compare attributes * @fattr: attributes * @inode: pointer to inode * * This function returns '1' if it thinks the attributes in @fattr are * more recent than the ones cached in @inode. It returns '-1' if * the attributes in @inode are more recent than the ones in @fattr, * and it returns 0 if not sure. */ static int nfs_inode_attrs_cmp(const struct nfs_fattr *fattr, const struct inode *inode) { if (nfs_inode_attrs_cmp_generic(fattr, inode) > 0) return 1; switch (NFS_SERVER(inode)->change_attr_type) { case NFS4_CHANGE_TYPE_IS_UNDEFINED: break; case NFS4_CHANGE_TYPE_IS_TIME_METADATA: if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE)) break; return nfs_inode_attrs_cmp_monotonic(fattr, inode); default: if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE)) break; return nfs_inode_attrs_cmp_strict_monotonic(fattr, inode); } return 0; } /** * nfs_inode_finish_partial_attr_update - complete a previous inode update * @fattr: attributes * @inode: pointer to inode * * Returns '1' if the last attribute update left the inode cached * attributes in a partially unrevalidated state, and @fattr * matches the change attribute of that partial update. * Otherwise returns '0'. */ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr, const struct inode *inode) { const unsigned long check_valid = NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER | NFS_INO_INVALID_NLINK; unsigned long cache_validity = NFS_I(inode)->cache_validity; enum nfs4_change_attr_type ctype = NFS_SERVER(inode)->change_attr_type; if (ctype != NFS4_CHANGE_TYPE_IS_UNDEFINED && !(cache_validity & NFS_INO_INVALID_CHANGE) && (cache_validity & check_valid) != 0 && (fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && nfs_inode_attrs_cmp_monotonic(fattr, inode) == 0) return 1; return 0; } static void nfs_ooo_merge(struct nfs_inode *nfsi, u64 start, u64 end) { int i, cnt; if (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER) /* No point merging anything */ return; if (!nfsi->ooo) { nfsi->ooo = kmalloc(sizeof(*nfsi->ooo), GFP_ATOMIC); if (!nfsi->ooo) { nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; return; } nfsi->ooo->cnt = 0; } /* add this range, merging if possible */ cnt = nfsi->ooo->cnt; for (i = 0; i < cnt; i++) { if (end == nfsi->ooo->gap[i].start) end = nfsi->ooo->gap[i].end; else if (start == nfsi->ooo->gap[i].end) start = nfsi->ooo->gap[i].start; else continue; /* Remove 'i' from table and loop to insert the new range */ cnt -= 1; nfsi->ooo->gap[i] = nfsi->ooo->gap[cnt]; i = -1; } if (start != end) { if (cnt >= ARRAY_SIZE(nfsi->ooo->gap)) { nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; kfree(nfsi->ooo); nfsi->ooo = NULL; return; } nfsi->ooo->gap[cnt].start = start; nfsi->ooo->gap[cnt].end = end; cnt += 1; } nfsi->ooo->cnt = cnt; } static void nfs_ooo_record(struct nfs_inode *nfsi, struct nfs_fattr *fattr) { /* This reply was out-of-order, so record in the * pre/post change id, possibly cancelling * gaps created when iversion was jumpped forward. */ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) && (fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) nfs_ooo_merge(nfsi, fattr->change_attr, fattr->pre_change_attr); } static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { int attr_cmp = nfs_inode_attrs_cmp(fattr, inode); int ret = 0; trace_nfs_refresh_inode_enter(inode); if (attr_cmp > 0 || nfs_inode_finish_partial_attr_update(fattr, inode)) ret = nfs_update_inode(inode, fattr); else { nfs_ooo_record(NFS_I(inode), fattr); if (attr_cmp == 0) ret = nfs_check_inode_attributes(inode, fattr); } trace_nfs_refresh_inode_exit(inode, ret); return ret; } /** * nfs_refresh_inode - try to update the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * Check that an RPC call that returned attributes has not overlapped with * other recent updates of the inode metadata, then decide whether it is * safe to do a full update of the inode attributes, or whether just to * call nfs_check_inode_attributes. */ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) { int status; if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; spin_lock(&inode->i_lock); status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } EXPORT_SYMBOL_GPL(nfs_refresh_inode); static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr, unsigned int invalid) { if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; return nfs_refresh_inode_locked(inode, fattr); } /** * nfs_post_op_update_inode - try to update the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. * * NB: if the server didn't return any post op attributes, this * function will force the retrieval of attributes before the next * NFS request. Thus it should be used only for operations that * are expected to change one or more attributes, to avoid * unnecessary NFS requests and trips through nfs_update_inode(). */ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) { int status; spin_lock(&inode->i_lock); nfs_fattr_set_barrier(fattr); status = nfs_post_op_update_inode_locked(inode, fattr, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); return status; } EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up * weak cache consistency data, if none exist. * * This function is mainly designed to be used by the ->write_done() functions. */ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr) { int attr_cmp = nfs_inode_attrs_cmp(fattr, inode); int status; /* Don't do a WCC update if these attributes are already stale */ if (attr_cmp < 0) return 0; if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !attr_cmp) { /* Record the pre/post change info before clearing PRECHANGE */ nfs_ooo_record(NFS_I(inode), fattr); fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE | NFS_ATTR_FATTR_PRESIZE | NFS_ATTR_FATTR_PREMTIME | NFS_ATTR_FATTR_PRECTIME); goto out_noforce; } if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { fattr->pre_change_attr = inode_peek_iversion_raw(inode); fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { fattr->pre_ctime = inode_get_ctime(inode); fattr->valid |= NFS_ATTR_FATTR_PRECTIME; } if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { fattr->pre_mtime = inode_get_mtime(inode); fattr->valid |= NFS_ATTR_FATTR_PREMTIME; } if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { fattr->pre_size = i_size_read(inode); fattr->valid |= NFS_ATTR_FATTR_PRESIZE; } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_BLOCKS); return status; } /** * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up * weak cache consistency data, if none exist. * * This function is mainly designed to be used by the ->write_done() functions. */ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) { int status; spin_lock(&inode->i_lock); nfs_fattr_set_barrier(fattr); status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); /* * Many nfs protocol calls return the new file attributes after * an operation. Here we update the inode to reflect the state * of the server's inode. * * This is a bit tricky because we have to make sure all dirty pages * have been sent off to the server before calling invalidate_inode_pages. * To make sure no other process adds more write requests while we try * our best to flush them, we make them sleep during the attribute refresh. * * A very similar scenario holds for the dir cache. */ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_isize, new_isize; u64 fattr_supported = server->fattr_valid; unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; bool have_writers = nfs_file_has_buffered_writers(nfsi); bool cache_revalidated = true; bool attr_changed = false; bool have_delegation; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? Just exit */ if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) return 0; /* Has the inode gone and changed behind our back? */ } else if (nfsi->fileid != fattr->fileid) { /* Is this perhaps the mounted-on fileid? */ if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && nfsi->fileid == fattr->mounted_on_fileid) return 0; printk(KERN_ERR "NFS: server %s error: fileid changed\n" "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id, (long long)nfsi->fileid, (long long)fattr->fileid); goto out_err; } /* * Make sure the inode's type hasn't changed. */ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) { /* * Big trouble! The inode has become a different object. */ printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n", __func__, inode->i_ino, inode->i_mode, fattr->mode); goto out_err; } /* Update the fsid? */ if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && !IS_AUTOMOUNT(inode)) server->fsid = fattr->fsid; /* Save the delegation state before clearing cache_validity */ have_delegation = nfs_have_delegated_attributes(inode); /* * Update the read time so we don't revalidate too often. */ nfsi->read_cache_jiffies = fattr->time_start; /* Fix up any delegated attributes in the struct nfs_fattr */ nfs_fattr_fixup_delegated(inode, fattr); save_cache_validity = nfsi->cache_validity; nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME | NFS_INO_REVAL_FORCED | NFS_INO_INVALID_BLOCKS); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); if (pnfs_layoutcommit_outstanding(inode)) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_BLOCKS); cache_revalidated = false; } /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { if (!have_writers && nfsi->ooo && nfsi->ooo->cnt == 1 && nfsi->ooo->gap[0].end == inode_peek_iversion_raw(inode)) { /* There is one remaining gap that hasn't been * merged into iversion - do that now. */ inode_set_iversion_raw(inode, nfsi->ooo->gap[0].start); kfree(nfsi->ooo); nfsi->ooo = NULL; } if (!inode_eq_iversion_raw(inode, fattr->change_attr)) { /* Could it be a race with writeback? */ if (!(have_writers || have_delegation)) { invalid |= NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_XATTR; /* Force revalidate of all attributes */ save_cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | NFS_INO_INVALID_MODE | NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); attr_changed = true; dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); } else if (!have_delegation) { nfs_ooo_record(nfsi, fattr); nfs_ooo_merge(nfsi, inode_peek_iversion_raw(inode), fattr->change_attr); } inode_set_iversion_raw(inode, fattr->change_attr); } } else { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CHANGE; if (!have_delegation || (nfsi->cache_validity & NFS_INO_INVALID_CHANGE) != 0) cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode_set_mtime_to_ts(inode, fattr->mtime); else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MTIME; if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode_set_ctime_to_ts(inode, fattr->ctime); else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CTIME; /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { new_isize = nfs_size_to_loff_t(fattr->size); cur_isize = i_size_read(inode); if (new_isize != cur_isize && !have_delegation) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ if (!nfs_have_writebacks(inode) || new_isize > cur_isize) { trace_nfs_size_update(inode, new_isize); i_size_write(inode, new_isize); if (!have_writers) invalid |= NFS_INO_INVALID_DATA; } } if (new_isize == 0 && !(fattr->valid & (NFS_ATTR_FATTR_SPACE_USED | NFS_ATTR_FATTR_BLOCKS_USED))) { fattr->du.nfs3.used = 0; fattr->valid |= NFS_ATTR_FATTR_SPACE_USED; } } else nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_SIZE; if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode_set_atime_to_ts(inode, fattr->atime); else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATIME; if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { umode_t newmode = inode->i_mode & S_IFMT; newmode |= fattr->mode & S_IALLUGO; inode->i_mode = newmode; invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; } } else if (fattr_supported & NFS_ATTR_FATTR_MODE) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MODE; if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; } } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_OTHER; if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; } } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_OTHER; if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) set_nlink(inode, fattr->nlink); } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_NLINK; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_BLOCKS; if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_BLOCKS; /* Update attrtimeo value if we're out of the unstable period */ if (attr_changed) { nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; /* Set barrier to be more recent than all outstanding updates */ nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { if (cache_revalidated) { if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { nfsi->attrtimeo <<= 1; if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode)) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); } nfsi->attrtimeo_timestamp = now; } /* Set the barrier to be more recent than this fattr */ if ((long)(fattr->gencount - nfsi->attr_gencount) > 0) nfsi->attr_gencount = fattr->gencount; } /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); return 0; out_err: /* * No need to worry about unhashing the dentry, as the * lookup validation will know that the inode is bad. * (But we fall through to invalidate the caches.) */ nfs_set_inode_stale_locked(inode); return -ESTALE; } struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; nfsi = alloc_inode_sb(sb, nfs_inode_cachep, GFP_KERNEL); if (!nfsi) return NULL; nfsi->flags = 0UL; nfsi->cache_validity = 0UL; nfsi->ooo = NULL; #if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_V4_2 nfsi->xattr_cache = NULL; #endif nfs_netfs_inode_init(nfsi); return &nfsi->vfs_inode; } EXPORT_SYMBOL_GPL(nfs_alloc_inode); void nfs_free_inode(struct inode *inode) { kfree(NFS_I(inode)->ooo); kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } EXPORT_SYMBOL_GPL(nfs_free_inode); static inline void nfs4_init_once(struct nfs_inode *nfsi) { #if IS_ENABLED(CONFIG_NFS_V4) INIT_LIST_HEAD(&nfsi->open_states); nfsi->delegation = NULL; init_rwsem(&nfsi->rwsem); nfsi->layout = NULL; #endif } static void init_once(void *foo) { struct nfs_inode *nfsi = foo; inode_init_once(&nfsi->vfs_inode); INIT_LIST_HEAD(&nfsi->open_files); INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); nfs4_init_once(nfsi); } static int __init nfs_init_inodecache(void) { nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (nfs_inode_cachep == NULL) return -ENOMEM; return 0; } static void nfs_destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(nfs_inode_cachep); } struct workqueue_struct *nfslocaliod_workqueue; struct workqueue_struct *nfsiod_workqueue; EXPORT_SYMBOL_GPL(nfsiod_workqueue); /* * Destroy the nfsiod workqueues */ static void nfsiod_stop(void) { struct workqueue_struct *wq; wq = nfsiod_workqueue; if (wq != NULL) { nfsiod_workqueue = NULL; destroy_workqueue(wq); } #if IS_ENABLED(CONFIG_NFS_LOCALIO) wq = nfslocaliod_workqueue; if (wq != NULL) { nfslocaliod_workqueue = NULL; destroy_workqueue(wq); } #endif /* CONFIG_NFS_LOCALIO */ } /* * Start the nfsiod workqueues */ static int nfsiod_start(void) { dprintk("RPC: creating workqueue nfsiod\n"); nfsiod_workqueue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (nfsiod_workqueue == NULL) return -ENOMEM; #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* * localio writes need to use a normal (non-memreclaim) workqueue. * When we start getting low on space, XFS goes and calls flush_work() on * a non-memreclaim work queue, which causes a priority inversion problem. */ dprintk("RPC: creating workqueue nfslocaliod\n"); nfslocaliod_workqueue = alloc_workqueue("nfslocaliod", WQ_UNBOUND, 0); if (unlikely(nfslocaliod_workqueue == NULL)) { nfsiod_stop(); return -ENOMEM; } #endif /* CONFIG_NFS_LOCALIO */ return 0; } unsigned int nfs_net_id; EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); nfs_clients_init(net); if (!rpc_proc_register(net, &nn->rpcstats)) { nfs_clients_exit(net); return -ENOMEM; } return nfs_fs_proc_net_init(net); } static void nfs_net_exit(struct net *net) { rpc_proc_unregister(net, "nfs"); nfs_fs_proc_net_exit(net); nfs_clients_exit(net); } static struct pernet_operations nfs_net_ops = { .init = nfs_net_init, .exit = nfs_net_exit, .id = &nfs_net_id, .size = sizeof(struct nfs_net), }; /* * Initialize NFS */ static int __init init_nfs_fs(void) { int err; err = nfs_sysfs_init(); if (err < 0) goto out10; err = register_pernet_subsys(&nfs_net_ops); if (err < 0) goto out9; err = nfsiod_start(); if (err) goto out7; err = nfs_fs_proc_init(); if (err) goto out6; err = nfs_init_nfspagecache(); if (err) goto out5; err = nfs_init_inodecache(); if (err) goto out4; err = nfs_init_readpagecache(); if (err) goto out3; err = nfs_init_writepagecache(); if (err) goto out2; err = nfs_init_directcache(); if (err) goto out1; err = register_nfs_fs(); if (err) goto out0; return 0; out0: nfs_destroy_directcache(); out1: nfs_destroy_writepagecache(); out2: nfs_destroy_readpagecache(); out3: nfs_destroy_inodecache(); out4: nfs_destroy_nfspagecache(); out5: nfs_fs_proc_exit(); out6: nfsiod_stop(); out7: unregister_pernet_subsys(&nfs_net_ops); out9: nfs_sysfs_exit(); out10: return err; } static void __exit exit_nfs_fs(void) { nfs_destroy_directcache(); nfs_destroy_writepagecache(); nfs_destroy_readpagecache(); nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); unregister_pernet_subsys(&nfs_net_ops); unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); nfs_sysfs_exit(); } /* Not quite true; I just maintain it */ MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); MODULE_DESCRIPTION("NFS client support"); MODULE_LICENSE("GPL"); module_param(enable_ino64, bool, 0644); module_init(init_nfs_fs) module_exit(exit_nfs_fs)
25 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 // SPDX-License-Identifier: GPL-2.0-only /* * crc16.c */ #include <linux/types.h> #include <linux/module.h> #include <linux/crc16.h> /** CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */ u16 const crc16_table[256] = { 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 }; EXPORT_SYMBOL(crc16_table); /** * crc16 - compute the CRC-16 for the data buffer * @crc: previous CRC value * @buffer: data pointer * @len: number of bytes in the buffer * * Returns the updated CRC value. */ u16 crc16(u16 crc, u8 const *buffer, size_t len) { while (len--) crc = crc16_byte(crc, *buffer++); return crc; } EXPORT_SYMBOL(crc16); MODULE_DESCRIPTION("CRC16 calculations"); MODULE_LICENSE("GPL");
470 18543 14798 14387 33 14709 6376 8800 380 379 17 3641 3637 69 390 117 48 39 505 297 19 361 438 91 10 2 4579 2398 667 74 2034 25 5972 6429 7 1622 145 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H #include <linux/container_of.h> #include <linux/types.h> #include <linux/stddef.h> #include <linux/poison.h> #include <linux/const.h> #include <asm/barrier.h> /* * Circular doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when * manipulating whole lists rather than single entries, as * sometimes we already know the next/prev entries and we can * generate better code by using them directly rather than * using the generic single-entry routines. */ #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) /** * INIT_LIST_HEAD - Initialize a list_head structure * @list: list_head structure to be initialized. * * Initializes the list_head to point to itself. If it is a list header, * the result is an empty list. */ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } #ifdef CONFIG_LIST_HARDENED #ifdef CONFIG_DEBUG_LIST # define __list_valid_slowpath #else # define __list_valid_slowpath __cold __preserve_most #endif /* * Performs the full set of list corruption checks before __list_add(). * On list corruption reports a warning, and returns false. */ extern bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new, struct list_head *prev, struct list_head *next); /* * Performs list corruption checks before __list_add(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_add_valid_or_report(). */ static __always_inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { /* * With the hardening version, elide checking if next and prev * are NULL, since the immediate dereference of them below would * result in a fault if NULL. * * With the reduced set of checks, we can afford to inline the * checks, which also gives the compiler a chance to elide some * of them completely if they can be proven at compile-time. If * one of the pre-conditions does not hold, the slow-path will * show a report which pre-condition failed. */ if (likely(next->prev == prev && prev->next == next && new != prev && new != next)) return true; ret = false; } ret &= __list_add_valid_or_report(new, prev, next); return ret; } /* * Performs the full set of list corruption checks before __list_del_entry(). * On list corruption reports a warning, and returns false. */ extern bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); /* * Performs list corruption checks before __list_del_entry(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_del_entry_valid_or_report(). */ static __always_inline bool __list_del_entry_valid(struct list_head *entry) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { struct list_head *prev = entry->prev; struct list_head *next = entry->next; /* * With the hardening version, elide checking if next and prev * are NULL, LIST_POISON1 or LIST_POISON2, since the immediate * dereference of them below would result in a fault. */ if (likely(prev->next == entry && next->prev == entry)) return true; ret = false; } ret &= __list_del_entry_valid_or_report(entry); return ret; } #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { return true; } static inline bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; WRITE_ONCE(prev->next, next); } /* * Delete a list entry and clear the 'prev' pointer. * * This is a special-purpose list clearing method used in the networking code * for lists allocated as per-cpu, where we don't want to incur the extra * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this * needs to check the node 'prev' pointer instead of calling list_empty(). */ static inline void __list_del_clearprev(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->prev = NULL; } static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) return; __list_del(entry->prev, entry->next); } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ static inline void list_del(struct list_head *entry) { __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } /** * list_replace_init - replace old entry by new one and initialize the old one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /** * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position * @entry1: the location to place entry2 * @entry2: the location to place entry1 */ static inline void list_swap(struct list_head *entry1, struct list_head *entry2) { struct list_head *pos = entry2->prev; list_del(entry2); list_replace(entry1, entry2); if (pos == entry1) pos = entry2; list_add(entry1, pos); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } /** * list_bulk_move_tail - move a subsection of a list to its tail * @head: the head that will follow our entry * @first: first entry to move * @last: last entry to move, can be the same as first * * Move all entries between @first and including @last before @head. * All three entries must belong to the same linked list. */ static inline void list_bulk_move_tail(struct list_head *head, struct list_head *first, struct list_head *last) { first->prev->next = last->next; last->next->prev = first->prev; head->prev->next = first; first->prev = head->prev; last->next = head; head->prev = last; } /** * list_is_first -- tests whether @list is the first entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_is_head - tests whether @list is the list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_head(const struct list_head *list, const struct list_head *head) { return list == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. */ static inline int list_empty(const struct list_head *head) { return READ_ONCE(head->next) == head; } /** * list_del_init_careful - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. * * This is the same as list_del_init(), except designed to be used * together with list_empty_careful() in a way to guarantee ordering * of other memory operations. * * Any memory operations done before a list_del_init_careful() are * guaranteed to be visible after a list_empty_careful() test. */ static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); WRITE_ONCE(entry->prev, entry); smp_store_release(&entry->next, entry); } /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test * * Description: * tests whether a list is empty _and_ checks that no other CPU might be * in the process of modifying either member (next or prev) * * NOTE: using list_empty_careful() without synchronization * can only be safe if the only activity that can happen * to the list entry is list_del_init(). Eg. it cannot be used * if another CPU could re-list_add() it. */ static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); return list_is_head(next, head) && (next == READ_ONCE(head->prev)); } /** * list_rotate_left - rotate the list to the left * @head: the head of the list */ static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } /** * list_rotate_to_front() - Rotate list to specific item. * @list: The desired new front of the list. * @head: The head of the list. * * Rotates list so that @list becomes the new front of the list. */ static inline void list_rotate_to_front(struct list_head *list, struct list_head *head) { /* * Deletes the list head from the list denoted by @head and * places it as the tail of @list, this effectively rotates the * list so that @list is at the front. */ list_move_tail(head, list); } /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. */ static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } /** * list_cut_position - cut a list into two * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should * pass on @entry an element you know is on @head. @list * should be an empty list or a list you do not care about * losing its data. * */ static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if (list_is_singular(head) && !list_is_head(entry, head) && (entry != head->next)) return; if (list_is_head(entry, head)) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } /** * list_cut_before - cut a list into two, before given entry * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * * This helper moves the initial part of @head, up to but * excluding @entry, from @head to @list. You should pass * in @entry an element you know is on @head. @list should * be an empty list or a list you do not care about losing * its data. * If @entry == @head, all entries on @head are moved to * @list. */ static inline void list_cut_before(struct list_head *list, struct list_head *head, struct list_head *entry) { if (head->next == entry) { INIT_LIST_HEAD(list); return; } list->next = head->next; list->next->prev = list; list->prev = entry->prev; list->prev->next = list; head->next = entry; entry->prev = head; } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } /** * list_splice - join two lists, this is designed for stacks * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } /** * list_splice_tail - join two lists, each list being a queue * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } /** * list_splice_tail_init - join two lists and reinitialise the emptied list * @list: the new list to add. * @head: the place to add it in the first list. * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_first_entry - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ list_entry((ptr)->prev, type, member) /** * list_first_entry_or_null - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_first_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->next); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_next_entry - get the next element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) /** * list_next_entry_circular - get the next element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the last element (return the first element). * Note, that list is expected to be not empty. */ #define list_next_entry_circular(pos, head, member) \ (list_is_last(&(pos)->member, head) ? \ list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member)) /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) /** * list_prev_entry_circular - get the prev element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the first element (return the last element). * Note, that list is expected to be not empty. */ #define list_prev_entry_circular(pos, head, member) \ (list_is_first(&(pos)->member, head) ? \ list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member)) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_rcu - Iterate over a list in an RCU-safe fashion * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_rcu(pos, head) \ for (pos = rcu_dereference((head)->next); \ !list_is_head(pos, (head)); \ pos = rcu_dereference(pos->next)) /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * Continue to iterate over a list, continuing after the current position. */ #define list_for_each_continue(pos, head) \ for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; \ !list_is_head(pos, (head)); \ pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ !list_is_head(pos, (head)); \ pos = n, n = pos->prev) /** * list_count_nodes - count nodes in the list * @head: the head for your list. */ static inline size_t list_count_nodes(struct list_head *head) { struct list_head *pos; size_t count = 0; list_for_each(pos, head) count++; return count; } /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ list_is_head(&pos->member, (head)) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_head within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). */ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_from_reverse - iterate backwards over list of given type * from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop * @n: temporary storage used in list_for_each_entry_safe * @member: the name of the list_head within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An * exception to this is if the cursor element (pos) is pinned in the list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is * too wasteful. * You lose the ability to access the tail in O(1). */ #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } /** * hlist_unhashed - Has node been removed from list and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed * state. For example, hlist_nulls_del_init_rcu() does leave the * node in unhashed state, but hlist_nulls_del() does not. */ static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } /** * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use * @h: Node to be checked * * This variant of hlist_unhashed() must be used in lockless contexts * to avoid potential load-tearing. The READ_ONCE() is paired with the * various WRITE_ONCE() in hlist helpers that are defined below. */ static inline int hlist_unhashed_lockless(const struct hlist_node *h) { return !READ_ONCE(h->pprev); } /** * hlist_empty - Is the specified hlist_head structure an empty hlist? * @h: Structure to check. */ static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (next) WRITE_ONCE(next->pprev, pprev); } /** * hlist_del - Delete the specified hlist_node from its list * @n: Node to delete. * * Note that this function leaves the node in hashed state. Use * hlist_del_init() or similar instead to unhash @n. */ static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } /** * hlist_del_init - Delete the specified hlist_node from its list and initialize * @n: Node to delete. * * Note that this function leaves the node in unhashed state. */ static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } } /** * hlist_add_head - add a new entry at the beginning of the hlist * @n: new entry to be added * @h: hlist head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; WRITE_ONCE(n->next, first); if (first) WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n); WRITE_ONCE(n->pprev, &h->first); } /** * hlist_add_before - add a new entry before the one specified * @n: new entry to be added * @next: hlist node to add it before, which must be non-NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); WRITE_ONCE(n->next, next); WRITE_ONCE(next->pprev, &n->next); WRITE_ONCE(*(n->pprev), n); } /** * hlist_add_behind - add a new entry after the one specified * @n: new entry to be added * @prev: hlist node to add it after, which must be non-NULL */ static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { WRITE_ONCE(n->next, prev->next); WRITE_ONCE(prev->next, n); WRITE_ONCE(n->pprev, &prev->next); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } /** * hlist_add_fake - create a fake hlist consisting of a single headless node * @n: Node to make a fake list out of * * This makes @n appear to be its own predecessor on a headless hlist. * The point of this is to allow things like hlist_del() to work correctly * in cases where there is no list. */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } /** * hlist_fake: Is this node a fake hlist? * @h: Node to check for being a self-referential fake hlist. */ static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } /** * hlist_is_singular_node - is node the only element of the specified hlist? * @n: Node to check for singularity. * @h: Header for potentially singular list. * * Check whether the node is the only node of the head without * accessing head, thus avoiding unnecessary cache misses. */ static inline bool hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) { return !n->next && n->pprev == &h->first; } /** * hlist_move_list - Move an hlist * @old: hlist_head for old list. * @new: hlist_head for new list. * * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. */ static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) { new->first = old->first; if (new->first) new->first->pprev = &new->first; old->first = NULL; } /** * hlist_splice_init() - move all entries from one list to another * @from: hlist_head from which entries will be moved * @last: last entry on the @from list * @to: hlist_head to which entries will be moved * * @to can be empty, @from must contain at least @last. */ static inline void hlist_splice_init(struct hlist_head *from, struct hlist_node *last, struct hlist_head *to) { if (to->first) to->first->pprev = &last->next; last->next = to->first; to->first = from->first; from->first->pprev = &to->first; from->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) #define hlist_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ }) /** * hlist_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry(pos, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_continue - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue(pos, member) \ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_from - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(pos, member) \ for (; pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: a &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_safe(pos, n, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) /** * hlist_count_nodes - count nodes in the hlist * @head: the head for your hlist. */ static inline size_t hlist_count_nodes(struct hlist_head *head) { struct hlist_node *pos; size_t count = 0; hlist_for_each(pos, head) count++; return count; } #endif
32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM smc #if !defined(_TRACE_SMC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SMC_H #include <linux/ipv6.h> #include <linux/tcp.h> #include <linux/tracepoint.h> #include <net/ipv6.h> #include "smc.h" #include "smc_core.h" TRACE_EVENT(smc_switch_to_fallback, TP_PROTO(const struct smc_sock *smc, int fallback_rsn), TP_ARGS(smc, fallback_rsn), TP_STRUCT__entry( __field(const void *, sk) __field(const void *, clcsk) __field(u64, net_cookie) __field(int, fallback_rsn) ), TP_fast_assign( const struct sock *sk = &smc->sk; const struct sock *clcsk = smc->clcsock->sk; __entry->sk = sk; __entry->clcsk = clcsk; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->fallback_rsn = fallback_rsn; ), TP_printk("sk=%p clcsk=%p net=%llu fallback_rsn=%d", __entry->sk, __entry->clcsk, __entry->net_cookie, __entry->fallback_rsn) ); DECLARE_EVENT_CLASS(smc_msg_event, TP_PROTO(const struct smc_sock *smc, size_t len), TP_ARGS(smc, len), TP_STRUCT__entry( __field(const void *, smc) __field(u64, net_cookie) __field(size_t, len) __string(name, smc->conn.lnk->ibname) ), TP_fast_assign( const struct sock *sk = &smc->sk; __entry->smc = smc; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->len = len; __assign_str(name); ), TP_printk("smc=%p net=%llu len=%zu dev=%s", __entry->smc, __entry->net_cookie, __entry->len, __get_str(name)) ); DEFINE_EVENT(smc_msg_event, smc_tx_sendmsg, TP_PROTO(const struct smc_sock *smc, size_t len), TP_ARGS(smc, len) ); DEFINE_EVENT(smc_msg_event, smc_rx_recvmsg, TP_PROTO(const struct smc_sock *smc, size_t len), TP_ARGS(smc, len) ); TRACE_EVENT(smcr_link_down, TP_PROTO(const struct smc_link *lnk, void *location), TP_ARGS(lnk, location), TP_STRUCT__entry( __field(const void *, lnk) __field(const void *, lgr) __field(u64, net_cookie) __field(int, state) __string(name, lnk->ibname) __field(void *, location) ), TP_fast_assign( const struct smc_link_group *lgr = lnk->lgr; __entry->lnk = lnk; __entry->lgr = lgr; __entry->net_cookie = lgr->net->net_cookie; __entry->state = lnk->state; __assign_str(name); __entry->location = location; ), TP_printk("lnk=%p lgr=%p net=%llu state=%d dev=%s location=%pS", __entry->lnk, __entry->lgr, __entry->net_cookie, __entry->state, __get_str(name), __entry->location) ); #endif /* _TRACE_SMC_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE smc_tracepoint #include <trace/define_trace.h>
115 114 115 1 1 1 1 16 16 16 16 16 16 16 9 15 16 14 9 16 16 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 // SPDX-License-Identifier: GPL-2.0 /* * Central processing for nfsd. * * Authors: Olaf Kirch (okir@monad.swb.de) * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/sched/signal.h> #include <linux/freezer.h> #include <linux/module.h> #include <linux/fs_struct.h> #include <linux/swap.h> #include <linux/siphash.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/bind.h> #include <linux/nfsacl.h> #include <linux/nfslocalio.h> #include <linux/seq_file.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <net/net_namespace.h> #include "nfsd.h" #include "cache.h" #include "vfs.h" #include "netns.h" #include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_SVC atomic_t nfsd_th_cnt = ATOMIC_INIT(0); static int nfsd(void *vrqstp); #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static int nfsd_acl_rpcbind_set(struct net *, const struct svc_program *, u32, int, unsigned short, unsigned short); static __be32 nfsd_acl_init_request(struct svc_rqst *, const struct svc_program *, struct svc_process_info *); #endif static int nfsd_rpcbind_set(struct net *, const struct svc_program *, u32, int, unsigned short, unsigned short); static __be32 nfsd_init_request(struct svc_rqst *, const struct svc_program *, struct svc_process_info *); /* * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks. * * Finally, the nfsd_mutex also protects some of the global variables that are * accessed when nfsd starts and that are settable via the write_* routines in * nfsctl.c. In particular: * * user_recovery_dirname * user_lease_time * nfsd_versions */ DEFINE_MUTEX(nfsd_mutex); #if IS_ENABLED(CONFIG_NFS_LOCALIO) static const struct svc_version *localio_versions[] = { [1] = &localio_version1, }; #define NFSD_LOCALIO_NRVERS ARRAY_SIZE(localio_versions) #endif /* CONFIG_NFS_LOCALIO */ #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static const struct svc_version *nfsd_acl_version[] = { # if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, # endif # if defined(CONFIG_NFSD_V3_ACL) [3] = &nfsd_acl_version3, # endif }; #define NFSD_ACL_MINVERS 2 #define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version) #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ static const struct svc_version *nfsd_version[NFSD_MAXVERS+1] = { #if defined(CONFIG_NFSD_V2) [2] = &nfsd_version2, #endif [3] = &nfsd_version3, #if defined(CONFIG_NFSD_V4) [4] = &nfsd_version4, #endif }; struct svc_program nfsd_programs[] = { { .pg_prog = NFS_PROGRAM, /* program number */ .pg_nvers = NFSD_MAXVERS+1, /* nr of entries in nfsd_version */ .pg_vers = nfsd_version, /* version table */ .pg_name = "nfsd", /* program name */ .pg_class = "nfsd", /* authentication class */ .pg_authenticate = svc_set_client, /* export authentication */ .pg_init_request = nfsd_init_request, .pg_rpcbind_set = nfsd_rpcbind_set, }, #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) { .pg_prog = NFS_ACL_PROGRAM, .pg_nvers = NFSD_ACL_NRVERS, .pg_vers = nfsd_acl_version, .pg_name = "nfsacl", .pg_class = "nfsd", .pg_authenticate = svc_set_client, .pg_init_request = nfsd_acl_init_request, .pg_rpcbind_set = nfsd_acl_rpcbind_set, }, #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ #if IS_ENABLED(CONFIG_NFS_LOCALIO) { .pg_prog = NFS_LOCALIO_PROGRAM, .pg_nvers = NFSD_LOCALIO_NRVERS, .pg_vers = localio_versions, .pg_name = "nfslocalio", .pg_class = "nfsd", .pg_authenticate = svc_set_client, .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, } #endif /* CONFIG_NFS_LOCALIO */ }; bool nfsd_support_version(int vers) { if (vers >= NFSD_MINVERS && vers <= NFSD_MAXVERS) return nfsd_version[vers] != NULL; return false; } int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change) { if (vers < NFSD_MINVERS || vers > NFSD_MAXVERS) return 0; switch(change) { case NFSD_SET: nn->nfsd_versions[vers] = nfsd_support_version(vers); break; case NFSD_CLEAR: nn->nfsd_versions[vers] = false; break; case NFSD_TEST: return nn->nfsd_versions[vers]; case NFSD_AVAIL: return nfsd_support_version(vers); } return 0; } static void nfsd_adjust_nfsd_versions4(struct nfsd_net *nn) { unsigned i; for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) { if (nn->nfsd4_minorversions[i]) return; } nfsd_vers(nn, 4, NFSD_CLEAR); } int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change) { if (minorversion > NFSD_SUPPORTED_MINOR_VERSION && change != NFSD_AVAIL) return -1; switch(change) { case NFSD_SET: nfsd_vers(nn, 4, NFSD_SET); nn->nfsd4_minorversions[minorversion] = nfsd_vers(nn, 4, NFSD_TEST); break; case NFSD_CLEAR: nn->nfsd4_minorversions[minorversion] = false; nfsd_adjust_nfsd_versions4(nn); break; case NFSD_TEST: return nn->nfsd4_minorversions[minorversion]; case NFSD_AVAIL: return minorversion <= NFSD_SUPPORTED_MINOR_VERSION && nfsd_vers(nn, 4, NFSD_AVAIL); } return 0; } bool nfsd_net_try_get(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); return (nn && percpu_ref_tryget_live(&nn->nfsd_net_ref)); } void nfsd_net_put(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); percpu_ref_put(&nn->nfsd_net_ref); } static void nfsd_net_done(struct percpu_ref *ref) { struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_net_ref); complete(&nn->nfsd_net_confirm_done); } static void nfsd_net_free(struct percpu_ref *ref) { struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_net_ref); complete(&nn->nfsd_net_free_done); } /* * Maximum number of nfsd processes */ #define NFSD_MAXSERVS 8192 int nfsd_nrthreads(struct net *net) { int rv = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); mutex_lock(&nfsd_mutex); if (nn->nfsd_serv) rv = nn->nfsd_serv->sv_nrthreads; mutex_unlock(&nfsd_mutex); return rv; } static int nfsd_init_socks(struct net *net, const struct cred *cred) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!list_empty(&nn->nfsd_serv->sv_permsocks)) return 0; error = svc_xprt_create(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT, SVC_SOCK_DEFAULTS, cred); if (error < 0) return error; error = svc_xprt_create(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT, SVC_SOCK_DEFAULTS, cred); if (error < 0) return error; return 0; } static int nfsd_users = 0; static int nfsd_startup_generic(void) { int ret; if (nfsd_users++) return 0; ret = nfsd_file_cache_init(); if (ret) goto dec_users; ret = nfs4_state_start(); if (ret) goto out_file_cache; return 0; out_file_cache: nfsd_file_cache_shutdown(); dec_users: nfsd_users--; return ret; } static void nfsd_shutdown_generic(void) { if (--nfsd_users) return; nfs4_state_shutdown(); nfsd_file_cache_shutdown(); } static bool nfsd_needs_lockd(struct nfsd_net *nn) { return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); } /** * nfsd_copy_write_verifier - Atomically copy a write verifier * @verf: buffer in which to receive the verifier cookie * @nn: NFS net namespace * * This function provides a wait-free mechanism for copying the * namespace's write verifier without tearing it. */ void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn) { unsigned int seq; do { seq = read_seqbegin(&nn->writeverf_lock); memcpy(verf, nn->writeverf, sizeof(nn->writeverf)); } while (read_seqretry(&nn->writeverf_lock, seq)); } static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn) { struct timespec64 now; u64 verf; /* * Because the time value is hashed, y2038 time_t overflow * is irrelevant in this usage. */ ktime_get_raw_ts64(&now); verf = siphash_2u64(now.tv_sec, now.tv_nsec, &nn->siphash_key); memcpy(nn->writeverf, &verf, sizeof(nn->writeverf)); } /** * nfsd_reset_write_verifier - Generate a new write verifier * @nn: NFS net namespace * * This function updates the ->writeverf field of @nn. This field * contains an opaque cookie that, according to Section 18.32.3 of * RFC 8881, "the client can use to determine whether a server has * changed instance state (e.g., server restart) between a call to * WRITE and a subsequent call to either WRITE or COMMIT. This * cookie MUST be unchanged during a single instance of the NFSv4.1 * server and MUST be unique between instances of the NFSv4.1 * server." */ void nfsd_reset_write_verifier(struct nfsd_net *nn) { write_seqlock(&nn->writeverf_lock); nfsd_reset_write_verifier_locked(nn); write_sequnlock(&nn->writeverf_lock); } /* * Crank up a set of per-namespace resources for a new NFSD instance, * including lockd, a duplicate reply cache, an open file cache * instance, and a cache of NFSv4 state objects. */ static int nfsd_startup_net(struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; if (nn->nfsd_net_up) return 0; ret = nfsd_startup_generic(); if (ret) return ret; ret = nfsd_init_socks(net, cred); if (ret) goto out_socks; if (nfsd_needs_lockd(nn) && !nn->lockd_up) { ret = lockd_up(net, cred); if (ret) goto out_socks; nn->lockd_up = true; } ret = nfsd_file_cache_start_net(net); if (ret) goto out_lockd; ret = nfsd_reply_cache_init(nn); if (ret) goto out_filecache; ret = nfs4_state_start_net(net); if (ret) goto out_reply_cache; #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_init_umount_work(nn); #endif nn->nfsd_net_up = true; return 0; out_reply_cache: nfsd_reply_cache_shutdown(nn); out_filecache: nfsd_file_cache_shutdown_net(net); out_lockd: if (nn->lockd_up) { lockd_down(net); nn->lockd_up = false; } out_socks: nfsd_shutdown_generic(); return ret; } static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!nn->nfsd_net_up) return; percpu_ref_kill_and_confirm(&nn->nfsd_net_ref, nfsd_net_done); wait_for_completion(&nn->nfsd_net_confirm_done); nfsd_export_flush(net); nfs4_state_shutdown_net(net); nfsd_reply_cache_shutdown(nn); nfsd_file_cache_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); nn->lockd_up = false; } wait_for_completion(&nn->nfsd_net_free_done); percpu_ref_exit(&nn->nfsd_net_ref); nn->nfsd_net_up = false; nfsd_shutdown_generic(); } static DEFINE_SPINLOCK(nfsd_notifier_lock); static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct net_device *dev = ifa->ifa_dev->dev; struct net *net = dev_net(dev); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in sin; if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); } spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; } static struct notifier_block nfsd_inetaddr_notifier = { .notifier_call = nfsd_inetaddr_event, }; #if IS_ENABLED(CONFIG_IPV6) static int nfsd_inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct net_device *dev = ifa->idev->dev; struct net *net = dev_net(dev); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in6 sin6; if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; } static struct notifier_block nfsd_inet6addr_notifier = { .notifier_call = nfsd_inet6addr_event, }; #endif /* Only used under nfsd_mutex, so this atomic may be overkill: */ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); /** * nfsd_destroy_serv - tear down NFSD's svc_serv for a namespace * @net: network namespace the NFS service is associated with */ void nfsd_destroy_serv(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv = nn->nfsd_serv; lockdep_assert_held(&nfsd_mutex); spin_lock(&nfsd_notifier_lock); nn->nfsd_serv = NULL; spin_unlock(&nfsd_notifier_lock); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } svc_xprt_destroy_all(serv, net); /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. */ svc_rpcb_cleanup(serv, net); nfsd_shutdown_net(net); svc_destroy(&serv); } void nfsd_reset_versions(struct nfsd_net *nn) { int i; for (i = 0; i <= NFSD_MAXVERS; i++) if (nfsd_vers(nn, i, NFSD_TEST)) return; for (i = 0; i <= NFSD_MAXVERS; i++) if (i != 4) nfsd_vers(nn, i, NFSD_SET); else { int minor = 0; while (nfsd_minorversion(nn, minor, NFSD_SET) >= 0) minor++; } } static int nfsd_get_default_max_blksize(void) { struct sysinfo i; unsigned long long target; unsigned long ret; si_meminfo(&i); target = (i.totalram - i.totalhigh) << PAGE_SHIFT; /* * Aim for 1/4096 of memory per thread This gives 1MB on 4Gig * machines, but only uses 32K on 128M machines. Bottom out at * 8K on 32M and smaller. Of course, this is only a default. */ target >>= 12; ret = NFSSVC_MAXBLKSIZE; while (ret > target && ret >= 8*1024*2) ret /= 2; return ret; } void nfsd_shutdown_threads(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; mutex_lock(&nfsd_mutex); serv = nn->nfsd_serv; if (serv == NULL) { mutex_unlock(&nfsd_mutex); return; } /* Kill outstanding nfsd threads */ svc_set_num_threads(serv, NULL, 0); nfsd_destroy_serv(net); mutex_unlock(&nfsd_mutex); } struct svc_rqst *nfsd_current_rqst(void) { if (kthread_func(current) == nfsd) return kthread_data(current); return NULL; } int nfsd_create_serv(struct net *net) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nn->nfsd_serv) return 0; error = percpu_ref_init(&nn->nfsd_net_ref, nfsd_net_free, 0, GFP_KERNEL); if (error) return error; init_completion(&nn->nfsd_net_free_done); init_completion(&nn->nfsd_net_confirm_done); if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); serv = svc_create_pooled(nfsd_programs, ARRAY_SIZE(nfsd_programs), &nn->nfsd_svcstats, nfsd_max_blksize, nfsd); if (serv == NULL) return -ENOMEM; error = svc_bind(serv, net); if (error < 0) { svc_destroy(&serv); return error; } spin_lock(&nfsd_notifier_lock); nn->nfsd_serv = serv; spin_unlock(&nfsd_notifier_lock); /* check if the notifier is already set */ if (atomic_inc_return(&nfsd_notifier_refcount) == 1) { register_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } nfsd_reset_write_verifier(nn); return 0; } int nfsd_nrpools(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (nn->nfsd_serv == NULL) return 0; else return nn->nfsd_serv->sv_nrpools; } int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv = nn->nfsd_serv; int i; if (serv) for (i = 0; i < serv->sv_nrpools && i < n; i++) nthreads[i] = serv->sv_pools[i].sp_nrthreads; return 0; } /** * nfsd_set_nrthreads - set the number of running threads in the net's service * @n: number of array members in @nthreads * @nthreads: array of thread counts for each pool * @net: network namespace to operate within * * This function alters the number of running threads for the given network * namespace in each pool. If passed an array longer then the number of pools * the extra pool settings are ignored. If passed an array shorter than the * number of pools, the missing values are interpreted as 0's. * * Returns 0 on success or a negative errno on error. */ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) { int i = 0; int tot = 0; int err = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); lockdep_assert_held(&nfsd_mutex); if (nn->nfsd_serv == NULL || n <= 0) return 0; /* * Special case: When n == 1, pass in NULL for the pool, so that the * change is distributed equally among them. */ if (n == 1) return svc_set_num_threads(nn->nfsd_serv, NULL, nthreads[0]); if (n > nn->nfsd_serv->sv_nrpools) n = nn->nfsd_serv->sv_nrpools; /* enforce a global maximum number of threads */ tot = 0; for (i = 0; i < n; i++) { nthreads[i] = min(nthreads[i], NFSD_MAXSERVS); tot += nthreads[i]; } if (tot > NFSD_MAXSERVS) { /* total too large: scale down requested numbers */ for (i = 0; i < n && tot > 0; i++) { int new = nthreads[i] * NFSD_MAXSERVS / tot; tot -= (nthreads[i] - new); nthreads[i] = new; } for (i = 0; i < n && tot > 0; i++) { nthreads[i]--; tot--; } } /* apply the new numbers */ for (i = 0; i < n; i++) { err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], nthreads[i]); if (err) goto out; } /* Anything undefined in array is considered to be 0 */ for (i = n; i < nn->nfsd_serv->sv_nrpools; ++i) { err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], 0); if (err) goto out; } out: return err; } /** * nfsd_svc: start up or shut down the nfsd server * @n: number of array members in @nthreads * @nthreads: array of thread counts for each pool * @net: network namespace to operate within * @cred: credentials to use for xprt creation * @scope: server scope value (defaults to nodename) * * Adjust the number of threads in each pool and return the new * total number of threads in the service. */ int nfsd_svc(int n, int *nthreads, struct net *net, const struct cred *cred, const char *scope) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; lockdep_assert_held(&nfsd_mutex); dprintk("nfsd: creating service\n"); strscpy(nn->nfsd_name, scope ? scope : utsname()->nodename, sizeof(nn->nfsd_name)); error = nfsd_create_serv(net); if (error) goto out; serv = nn->nfsd_serv; error = nfsd_startup_net(net, cred); if (error) goto out_put; error = nfsd_set_nrthreads(n, nthreads, net); if (error) goto out_put; error = serv->sv_nrthreads; out_put: if (serv->sv_nrthreads == 0) nfsd_destroy_serv(net); out: return error; } #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static bool nfsd_support_acl_version(int vers) { if (vers >= NFSD_ACL_MINVERS && vers < NFSD_ACL_NRVERS) return nfsd_acl_version[vers] != NULL; return false; } static int nfsd_acl_rpcbind_set(struct net *net, const struct svc_program *progp, u32 version, int family, unsigned short proto, unsigned short port) { if (!nfsd_support_acl_version(version) || !nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST)) return 0; return svc_generic_rpcbind_set(net, progp, version, family, proto, port); } static __be32 nfsd_acl_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *ret) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); int i; if (likely(nfsd_support_acl_version(rqstp->rq_vers) && nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST))) return svc_generic_init_request(rqstp, progp, ret); ret->mismatch.lovers = NFSD_ACL_NRVERS; for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) { if (nfsd_support_acl_version(rqstp->rq_vers) && nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.lovers = i; break; } } if (ret->mismatch.lovers == NFSD_ACL_NRVERS) return rpc_prog_unavail; ret->mismatch.hivers = NFSD_ACL_MINVERS; for (i = NFSD_ACL_NRVERS - 1; i >= NFSD_ACL_MINVERS; i--) { if (nfsd_support_acl_version(rqstp->rq_vers) && nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.hivers = i; break; } } return rpc_prog_mismatch; } #endif static int nfsd_rpcbind_set(struct net *net, const struct svc_program *progp, u32 version, int family, unsigned short proto, unsigned short port) { if (!nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST)) return 0; return svc_generic_rpcbind_set(net, progp, version, family, proto, port); } static __be32 nfsd_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *ret) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); int i; if (likely(nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST))) return svc_generic_init_request(rqstp, progp, ret); ret->mismatch.lovers = NFSD_MAXVERS + 1; for (i = NFSD_MINVERS; i <= NFSD_MAXVERS; i++) { if (nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.lovers = i; break; } } if (ret->mismatch.lovers > NFSD_MAXVERS) return rpc_prog_unavail; ret->mismatch.hivers = NFSD_MINVERS; for (i = NFSD_MAXVERS; i >= NFSD_MINVERS; i--) { if (nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.hivers = i; break; } } return rpc_prog_mismatch; } /* * This is the NFS server kernel thread */ static int nfsd(void *vrqstp) { struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); struct net *net = perm_sock->xpt_net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); /* At this point, the thread shares current->fs * with the init process. We need to create files with the * umask as defined by the client instead of init's umask. */ svc_thread_init_status(rqstp, unshare_fs_struct()); current->fs->umask = 0; atomic_inc(&nfsd_th_cnt); set_freezable(); /* * The main request loop */ while (!svc_thread_should_stop(rqstp)) { svc_recv(rqstp); nfsd_file_net_dispose(nn); } atomic_dec(&nfsd_th_cnt); /* Release the thread */ svc_exit_thread(rqstp); return 0; } /** * nfsd_dispatch - Process an NFS or NFSACL or LOCALIO Request * @rqstp: incoming request * * This RPC dispatcher integrates the NFS server's duplicate reply cache. * * Return values: * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ int nfsd_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; struct nfsd_cacherep *rp; unsigned int start, len; __be32 *nfs_reply; /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) */ rqstp->rq_cachetype = proc->pc_cachetype; /* * ->pc_decode advances the argument stream past the NFS * Call header, so grab the header's starting location and * size now for the call to nfsd_cache_lookup(). */ start = xdr_stream_pos(&rqstp->rq_arg_stream); len = xdr_stream_remaining(&rqstp->rq_arg_stream); if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; /* * Release rq_status_counter setting it to an odd value after the rpc * request has been properly parsed. rq_status_counter is used to * notify the consumers if the rqstp fields are stable * (rq_status_counter is odd) or not meaningful (rq_status_counter * is even). */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); rp = NULL; switch (nfsd_cache_lookup(rqstp, start, len, &rp)) { case RC_DOIT: break; case RC_REPLY: goto out_cached_reply; case RC_DROPIT: goto out_dropit; } nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0); *statp = proc->pc_func(rqstp); if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; /* * Release rq_status_counter setting it to an even value after the rpc * request has been properly processed. */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply); out_cached_reply: return 1; out_decode_err: trace_nfsd_garbage_args_err(rqstp); *statp = rpc_garbage_args; return 1; out_update_drop: nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); out_dropit: return 0; out_encode_err: trace_nfsd_cant_encode_err(rqstp); nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); *statp = rpc_system_err; return 1; } /** * nfssvc_decode_voidarg - Decode void arguments * @rqstp: Server RPC transaction context * @xdr: XDR stream positioned at arguments to decode * * Return values: * %false: Arguments were not valid * %true: Decoding was successful */ bool nfssvc_decode_voidarg(struct svc_rqst *rqstp, struct xdr_stream *xdr) { return true; } /** * nfssvc_encode_voidres - Encode void results * @rqstp: Server RPC transaction context * @xdr: XDR stream into which to encode results * * Return values: * %false: Local error while encoding * %true: Encoding was successful */ bool nfssvc_encode_voidres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { return true; }
9 9 1 1 10 10 10 10 1 1 6 4 24 4 17 4 14 14 12 36 36 6 25 5 1 3 24 6 1 5 5 5 5 5 1 4 4 1 4 1 5 63 64 3 1 1 60 58 60 62 63 63 4 1 1 1 1 7 7 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 6 2 2 2 8 8 8 8 8 8 8 8 8 8 8 3 5 8 8 8 11 11 11 8 8 8 8 8 8 8 8 8 8 7 1 8 8 8 8 8 3 3 3 3 3 3 3 3 3 3 3 2 1 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 5 2 3 10 10 9 3 2 1 1 2 2 2 2 2 2 2 2 2 1 1 26 25 2 1 12 58 3 1 3 26 20 7 26 1 1 20 6 21 3 2 17 2 4 16 5 18 3 6 6 6 2 4 1 1 5 5 5 2 5 119 119 116 5 5 6 6 6 6 9 1 8 6 6 6 6 6 1 1 1 1 1 1 1 1 19 15 5 7 12 12 12 12 12 3 3 3 7 1 6 3 3 7 7 7 7 7 7 1 5 1 3 3 7 7 7 2 18 18 16 16 56 4 54 4 53 40 32 3 32 3 118 118 118 118 5 99 40 1 118 75 72 3 75 75 1 1 136 136 137 137 137 137 137 136 135 137 146 146 14 104 28 144 4 4 4 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 configuration hooks for cfg80211 * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH * Copyright (C) 2018-2025 Intel Corporation */ #include <linux/ieee80211.h> #include <linux/nl80211.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <linux/rcupdate.h> #include <linux/fips.h> #include <linux/if_ether.h> #include <net/cfg80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "mesh.h" #include "wme.h" static struct ieee80211_link_data * ieee80211_link_or_deflink(struct ieee80211_sub_if_data *sdata, int link_id, bool require_valid) { struct ieee80211_link_data *link; if (link_id < 0) { /* * For keys, if sdata is not an MLD, we might not use * the return value at all (if it's not a pairwise key), * so in that case (require_valid==false) don't error. */ if (require_valid && ieee80211_vif_is_mld(&sdata->vif)) return ERR_PTR(-EINVAL); return &sdata->deflink; } link = sdata_dereference(sdata->link[link_id], sdata); if (!link) return ERR_PTR(-ENOLINK); return link; } static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata, struct vif_params *params) { bool mu_mimo_groups = false; bool mu_mimo_follow = false; if (params->vht_mumimo_groups) { u64 membership; BUILD_BUG_ON(sizeof(membership) != WLAN_MEMBERSHIP_LEN); memcpy(sdata->vif.bss_conf.mu_group.membership, params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN); memcpy(sdata->vif.bss_conf.mu_group.position, params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN, WLAN_USER_POSITION_LEN); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_MU_GROUPS); /* don't care about endianness - just check for 0 */ memcpy(&membership, params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN); mu_mimo_groups = membership != 0; } if (params->vht_mumimo_follow_addr) { mu_mimo_follow = is_valid_ether_addr(params->vht_mumimo_follow_addr); ether_addr_copy(sdata->u.mntr.mu_follow_addr, params->vht_mumimo_follow_addr); } sdata->vif.bss_conf.mu_mimo_owner = mu_mimo_groups || mu_mimo_follow; } static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, struct vif_params *params) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *monitor_sdata; /* check flags first */ if (params->flags && ieee80211_sdata_running(sdata)) { u32 mask = MONITOR_FLAG_ACTIVE; /* * Prohibit MONITOR_FLAG_ACTIVE to be changed * while the interface is up. * Else we would need to add a lot of cruft * to update everything: * monitor and all fif_* counters * reconfigure hardware */ if ((params->flags & mask) != (sdata->u.mntr.flags & mask)) return -EBUSY; } /* also validate MU-MIMO change */ if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) monitor_sdata = sdata; else monitor_sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (!monitor_sdata && (params->vht_mumimo_groups || params->vht_mumimo_follow_addr)) return -EOPNOTSUPP; /* apply all changes now - no failures allowed */ if (monitor_sdata && (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) || ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR))) ieee80211_set_mu_mimo_follow(monitor_sdata, params); if (params->flags) { if (ieee80211_sdata_running(sdata)) { ieee80211_adjust_monitor_flags(sdata, -1); sdata->u.mntr.flags = params->flags; ieee80211_adjust_monitor_flags(sdata, 1); ieee80211_configure_filter(local); } else { /* * Because the interface is down, ieee80211_do_stop * and ieee80211_do_open take care of "everything" * mentioned in the comment above. */ sdata->u.mntr.flags = params->flags; } } return 0; } static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, struct cfg80211_mbssid_config *params, struct ieee80211_bss_conf *link_conf) { struct ieee80211_sub_if_data *tx_sdata; struct ieee80211_bss_conf *old; link_conf->bssid_index = 0; link_conf->nontransmitted = false; link_conf->ema_ap = false; link_conf->bssid_indicator = 0; if (sdata->vif.type != NL80211_IFTYPE_AP || !params->tx_wdev) return -EINVAL; old = sdata_dereference(link_conf->tx_bss_conf, sdata); if (old) return -EALREADY; tx_sdata = IEEE80211_WDEV_TO_SUB_IF(params->tx_wdev); if (!tx_sdata) return -EINVAL; if (tx_sdata == sdata) { rcu_assign_pointer(link_conf->tx_bss_conf, link_conf); } else { struct ieee80211_bss_conf *tx_bss_conf; tx_bss_conf = sdata_dereference(tx_sdata->vif.link_conf[params->tx_link_id], sdata); if (rcu_access_pointer(tx_bss_conf->tx_bss_conf) != tx_bss_conf) return -EINVAL; rcu_assign_pointer(link_conf->tx_bss_conf, tx_bss_conf); link_conf->nontransmitted = true; link_conf->bssid_index = params->index; } if (params->ema) link_conf->ema_ap = true; return 0; } static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy, const char *name, unsigned char name_assign_type, enum nl80211_iftype type, struct vif_params *params) { struct ieee80211_local *local = wiphy_priv(wiphy); struct wireless_dev *wdev; struct ieee80211_sub_if_data *sdata; int err; err = ieee80211_if_add(local, name, name_assign_type, &wdev, type, params); if (err) return ERR_PTR(err); sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (type == NL80211_IFTYPE_MONITOR) { err = ieee80211_set_mon_options(sdata, params); if (err) { ieee80211_if_remove(sdata); return NULL; } } /* Let the driver know that an interface is going to be added. * Indicate so only for interface types that will be added to the * driver. */ switch (type) { case NL80211_IFTYPE_AP_VLAN: break; case NL80211_IFTYPE_MONITOR: if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) || !(params->flags & MONITOR_FLAG_ACTIVE)) break; fallthrough; default: drv_prep_add_interface(local, ieee80211_vif_type_p2p(&sdata->vif)); break; } return wdev; } static int ieee80211_del_iface(struct wiphy *wiphy, struct wireless_dev *wdev) { ieee80211_if_remove(IEEE80211_WDEV_TO_SUB_IF(wdev)); return 0; } static int ieee80211_change_iface(struct wiphy *wiphy, struct net_device *dev, enum nl80211_iftype type, struct vif_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; int ret; lockdep_assert_wiphy(local->hw.wiphy); ret = ieee80211_if_change_type(sdata, type); if (ret) return ret; if (type == NL80211_IFTYPE_AP_VLAN && params->use_4addr == 0) { RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); ieee80211_check_fast_rx_iface(sdata); } else if (type == NL80211_IFTYPE_STATION && params->use_4addr >= 0) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (params->use_4addr == ifmgd->use_4addr) return 0; /* FIXME: no support for 4-addr MLO yet */ if (ieee80211_vif_is_mld(&sdata->vif)) return -EOPNOTSUPP; sdata->u.mgd.use_4addr = params->use_4addr; if (!ifmgd->associated) return 0; sta = sta_info_get(sdata, sdata->deflink.u.mgd.bssid); if (sta) drv_sta_set_4addr(local, sdata, &sta->sta, params->use_4addr); if (params->use_4addr) ieee80211_send_4addr_nullfunc(local, sdata); } if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { ret = ieee80211_set_mon_options(sdata, params); if (ret) return ret; } return 0; } static int ieee80211_start_p2p_device(struct wiphy *wiphy, struct wireless_dev *wdev) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); ret = ieee80211_check_combinations(sdata, NULL, 0, 0, -1); if (ret < 0) return ret; return ieee80211_do_open(wdev, true); } static void ieee80211_stop_p2p_device(struct wiphy *wiphy, struct wireless_dev *wdev) { ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev)); } static int ieee80211_start_nan(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); ret = ieee80211_check_combinations(sdata, NULL, 0, 0, -1); if (ret < 0) return ret; ret = ieee80211_do_open(wdev, true); if (ret) return ret; ret = drv_start_nan(sdata->local, sdata, conf); if (ret) ieee80211_sdata_stop(sdata); sdata->u.nan.conf = *conf; return ret; } static void ieee80211_stop_nan(struct wiphy *wiphy, struct wireless_dev *wdev) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); drv_stop_nan(sdata->local, sdata); ieee80211_sdata_stop(sdata); } static int ieee80211_nan_change_conf(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf, u32 changes) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct cfg80211_nan_conf new_conf; int ret = 0; if (sdata->vif.type != NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; new_conf = sdata->u.nan.conf; if (changes & CFG80211_NAN_CONF_CHANGED_PREF) new_conf.master_pref = conf->master_pref; if (changes & CFG80211_NAN_CONF_CHANGED_BANDS) new_conf.bands = conf->bands; ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes); if (!ret) sdata->u.nan.conf = new_conf; return ret; } static int ieee80211_add_nan_func(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_func *nan_func) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); int ret; if (sdata->vif.type != NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; spin_lock_bh(&sdata->u.nan.func_lock); ret = idr_alloc(&sdata->u.nan.function_inst_ids, nan_func, 1, sdata->local->hw.max_nan_de_entries + 1, GFP_ATOMIC); spin_unlock_bh(&sdata->u.nan.func_lock); if (ret < 0) return ret; nan_func->instance_id = ret; WARN_ON(nan_func->instance_id == 0); ret = drv_add_nan_func(sdata->local, sdata, nan_func); if (ret) { spin_lock_bh(&sdata->u.nan.func_lock); idr_remove(&sdata->u.nan.function_inst_ids, nan_func->instance_id); spin_unlock_bh(&sdata->u.nan.func_lock); } return ret; } static struct cfg80211_nan_func * ieee80211_find_nan_func_by_cookie(struct ieee80211_sub_if_data *sdata, u64 cookie) { struct cfg80211_nan_func *func; int id; lockdep_assert_held(&sdata->u.nan.func_lock); idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) { if (func->cookie == cookie) return func; } return NULL; } static void ieee80211_del_nan_func(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct cfg80211_nan_func *func; u8 instance_id = 0; if (sdata->vif.type != NL80211_IFTYPE_NAN || !ieee80211_sdata_running(sdata)) return; spin_lock_bh(&sdata->u.nan.func_lock); func = ieee80211_find_nan_func_by_cookie(sdata, cookie); if (func) instance_id = func->instance_id; spin_unlock_bh(&sdata->u.nan.func_lock); if (instance_id) drv_del_nan_func(sdata->local, sdata, instance_id); } static int ieee80211_set_noack_map(struct wiphy *wiphy, struct net_device *dev, u16 noack_map) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); sdata->noack_map = noack_map; ieee80211_check_fast_xmit_iface(sdata); return 0; } static int ieee80211_set_tx(struct ieee80211_sub_if_data *sdata, const u8 *mac_addr, u8 key_idx) { struct ieee80211_local *local = sdata->local; struct ieee80211_key *key; struct sta_info *sta; int ret = -EINVAL; if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_EXT_KEY_ID)) return -EINVAL; sta = sta_info_get_bss(sdata, mac_addr); if (!sta) return -EINVAL; if (sta->ptk_idx == key_idx) return 0; key = wiphy_dereference(local->hw.wiphy, sta->ptk[key_idx]); if (key && key->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) ret = ieee80211_set_tx_key(key); return ret; } static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr, struct key_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, false); struct ieee80211_local *local = sdata->local; struct sta_info *sta = NULL; struct ieee80211_key *key; int err; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; if (IS_ERR(link)) return PTR_ERR(link); if (WARN_ON(pairwise && link_id >= 0)) return -EINVAL; if (pairwise && params->mode == NL80211_KEY_SET_TX) return ieee80211_set_tx(sdata, mac_addr, key_idx); /* reject WEP and TKIP keys if WEP failed to initialize */ switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_WEP104: if (link_id >= 0) return -EINVAL; if (WARN_ON_ONCE(fips_enabled)) return -EINVAL; break; default: break; } key = ieee80211_key_alloc(params->cipher, key_idx, params->key_len, params->key, params->seq_len, params->seq); if (IS_ERR(key)) return PTR_ERR(key); if (pairwise) { key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE; key->conf.link_id = -1; } else { key->conf.link_id = link->link_id; } if (params->mode == NL80211_KEY_NO_TX) key->conf.flags |= IEEE80211_KEY_FLAG_NO_AUTO_TX; if (mac_addr) { sta = sta_info_get_bss(sdata, mac_addr); /* * The ASSOC test makes sure the driver is ready to * receive the key. When wpa_supplicant has roamed * using FT, it attempts to set the key before * association has completed, this rejects that attempt * so it will set the key again after association. * * TODO: accept the key if we have a station entry and * add it to the device after the station. */ if (!sta || !test_sta_flag(sta, WLAN_STA_ASSOC)) { ieee80211_key_free_unused(key); return -ENOENT; } } switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: /* Keys without a station are used for TX only */ if (sta && test_sta_flag(sta, WLAN_STA_MFP)) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; case NL80211_IFTYPE_ADHOC: /* no MFP (yet) */ break; case NL80211_IFTYPE_MESH_POINT: #ifdef CONFIG_MAC80211_MESH if (sdata->u.mesh.security != IEEE80211_MESH_SEC_NONE) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; #endif case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_OCB: /* shouldn't happen */ WARN_ON_ONCE(1); break; } err = ieee80211_key_link(key, link, sta); /* KRACK protection, shouldn't happen but just silently accept key */ if (err == -EALREADY) err = 0; return err; } static struct ieee80211_key * ieee80211_lookup_key(struct ieee80211_sub_if_data *sdata, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr) { struct ieee80211_local *local __maybe_unused = sdata->local; struct ieee80211_link_data *link = &sdata->deflink; struct ieee80211_key *key; if (link_id >= 0) { link = sdata_dereference(sdata->link[link_id], sdata); if (!link) return NULL; } if (mac_addr) { struct sta_info *sta; struct link_sta_info *link_sta; sta = sta_info_get_bss(sdata, mac_addr); if (!sta) return NULL; if (link_id >= 0) { link_sta = rcu_dereference_check(sta->link[link_id], lockdep_is_held(&local->hw.wiphy->mtx)); if (!link_sta) return NULL; } else { link_sta = &sta->deflink; } if (pairwise && key_idx < NUM_DEFAULT_KEYS) return wiphy_dereference(local->hw.wiphy, sta->ptk[key_idx]); if (!pairwise && key_idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + NUM_DEFAULT_BEACON_KEYS) return wiphy_dereference(local->hw.wiphy, link_sta->gtk[key_idx]); return NULL; } if (pairwise && key_idx < NUM_DEFAULT_KEYS) return wiphy_dereference(local->hw.wiphy, sdata->keys[key_idx]); key = wiphy_dereference(local->hw.wiphy, link->gtk[key_idx]); if (key) return key; /* or maybe it was a WEP key */ if (key_idx < NUM_DEFAULT_KEYS) return wiphy_dereference(local->hw.wiphy, sdata->keys[key_idx]); return NULL; } static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_key *key; lockdep_assert_wiphy(local->hw.wiphy); key = ieee80211_lookup_key(sdata, link_id, key_idx, pairwise, mac_addr); if (!key) return -ENOENT; ieee80211_key_free(key, sdata->vif.type == NL80211_IFTYPE_STATION); return 0; } static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params *params)) { struct ieee80211_sub_if_data *sdata; u8 seq[6] = {0}; struct key_params params; struct ieee80211_key *key; u64 pn64; u32 iv32; u16 iv16; int err = -ENOENT; struct ieee80211_key_seq kseq = {}; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); key = ieee80211_lookup_key(sdata, link_id, key_idx, pairwise, mac_addr); if (!key) goto out; memset(&params, 0, sizeof(params)); params.cipher = key->conf.cipher; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: pn64 = atomic64_read(&key->conf.tx_pn); iv32 = TKIP_PN_TO_IV32(pn64); iv16 = TKIP_PN_TO_IV16(pn64); if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { drv_get_key_seq(sdata->local, key, &kseq); iv32 = kseq.tkip.iv32; iv16 = kseq.tkip.iv16; } seq[0] = iv16 & 0xff; seq[1] = (iv16 >> 8) & 0xff; seq[2] = iv32 & 0xff; seq[3] = (iv32 >> 8) & 0xff; seq[4] = (iv32 >> 16) & 0xff; seq[5] = (iv32 >> 24) & 0xff; params.seq = seq; params.seq_len = 6; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), aes_cmac)); fallthrough; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), aes_gmac)); fallthrough; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), gcmp)); if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { drv_get_key_seq(sdata->local, key, &kseq); memcpy(seq, kseq.ccmp.pn, 6); } else { pn64 = atomic64_read(&key->conf.tx_pn); seq[0] = pn64; seq[1] = pn64 >> 8; seq[2] = pn64 >> 16; seq[3] = pn64 >> 24; seq[4] = pn64 >> 32; seq[5] = pn64 >> 40; } params.seq = seq; params.seq_len = 6; break; default: if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) break; if (WARN_ON(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) break; drv_get_key_seq(sdata->local, key, &kseq); params.seq = kseq.hw.seq; params.seq_len = kseq.hw.seq_len; break; } callback(cookie, &params); err = 0; out: rcu_read_unlock(); return err; } static int ieee80211_config_default_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx, bool uni, bool multi) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, false); if (IS_ERR(link)) return PTR_ERR(link); ieee80211_set_default_key(link, key_idx, uni, multi); return 0; } static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, true); if (IS_ERR(link)) return PTR_ERR(link); ieee80211_set_default_mgmt_key(link, key_idx); return 0; } static int ieee80211_config_default_beacon_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, true); if (IS_ERR(link)) return PTR_ERR(link); ieee80211_set_default_beacon_key(link, key_idx); return 0; } void sta_set_rate_info_tx(struct sta_info *sta, const struct ieee80211_tx_rate *rate, struct rate_info *rinfo) { rinfo->flags = 0; if (rate->flags & IEEE80211_TX_RC_MCS) { rinfo->flags |= RATE_INFO_FLAGS_MCS; rinfo->mcs = rate->idx; } else if (rate->flags & IEEE80211_TX_RC_VHT_MCS) { rinfo->flags |= RATE_INFO_FLAGS_VHT_MCS; rinfo->mcs = ieee80211_rate_get_vht_mcs(rate); rinfo->nss = ieee80211_rate_get_vht_nss(rate); } else { struct ieee80211_supported_band *sband; sband = ieee80211_get_sband(sta->sdata); WARN_ON_ONCE(sband && !sband->bitrates); if (sband && sband->bitrates) rinfo->legacy = sband->bitrates[rate->idx].bitrate; } if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) rinfo->bw = RATE_INFO_BW_40; else if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH) rinfo->bw = RATE_INFO_BW_80; else if (rate->flags & IEEE80211_TX_RC_160_MHZ_WIDTH) rinfo->bw = RATE_INFO_BW_160; else rinfo->bw = RATE_INFO_BW_20; if (rate->flags & IEEE80211_TX_RC_SHORT_GI) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; } static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *mac, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; int ret = -ENOENT; lockdep_assert_wiphy(local->hw.wiphy); sta = sta_info_get_by_idx(sdata, idx); if (sta) { ret = 0; memcpy(mac, sta->sta.addr, ETH_ALEN); sta_set_sinfo(sta, sinfo, true); } return ret; } static int ieee80211_dump_survey(struct wiphy *wiphy, struct net_device *dev, int idx, struct survey_info *survey) { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); return drv_get_survey(local, idx, survey); } static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev, const u8 *mac, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; int ret = -ENOENT; lockdep_assert_wiphy(local->hw.wiphy); sta = sta_info_get_bss(sdata, mac); if (sta) { ret = 0; sta_set_sinfo(sta, sinfo, true); } return ret; } static int ieee80211_set_monitor_channel(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata; struct ieee80211_chan_req chanreq = { .oper = *chandef }; int ret; lockdep_assert_wiphy(local->hw.wiphy); sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { if (cfg80211_chandef_identical(&local->monitor_chanreq.oper, &chanreq.oper)) return 0; sdata = wiphy_dereference(wiphy, local->monitor_sdata); if (!sdata) goto done; } if (rcu_access_pointer(sdata->deflink.conf->chanctx_conf) && cfg80211_chandef_identical(&sdata->vif.bss_conf.chanreq.oper, &chanreq.oper)) return 0; ieee80211_link_release_channel(&sdata->deflink); ret = ieee80211_link_use_channel(&sdata->deflink, &chanreq, IEEE80211_CHANCTX_SHARED); if (ret) return ret; done: local->monitor_chanreq = chanreq; return 0; } static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, const u8 *resp, size_t resp_len, const struct ieee80211_csa_settings *csa, const struct ieee80211_color_change_settings *cca, struct ieee80211_link_data *link) { struct probe_resp *new, *old; if (!resp || !resp_len) return 1; old = sdata_dereference(link->u.ap.probe_resp, sdata); new = kzalloc(sizeof(struct probe_resp) + resp_len, GFP_KERNEL); if (!new) return -ENOMEM; new->len = resp_len; memcpy(new->data, resp, resp_len); if (csa) memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_presp, csa->n_counter_offsets_presp * sizeof(new->cntdwn_counter_offsets[0])); else if (cca) new->cntdwn_counter_offsets[0] = cca->counter_offset_presp; rcu_assign_pointer(link->u.ap.probe_resp, new); if (old) kfree_rcu(old, rcu_head); return 0; } static int ieee80211_set_fils_discovery(struct ieee80211_sub_if_data *sdata, struct cfg80211_fils_discovery *params, struct ieee80211_link_data *link, struct ieee80211_bss_conf *link_conf, u64 *changed) { struct fils_discovery_data *new, *old = NULL; struct ieee80211_fils_discovery *fd; if (!params->update) return 0; fd = &link_conf->fils_discovery; fd->min_interval = params->min_interval; fd->max_interval = params->max_interval; old = sdata_dereference(link->u.ap.fils_discovery, sdata); if (old) kfree_rcu(old, rcu_head); if (params->tmpl && params->tmpl_len) { new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL); if (!new) return -ENOMEM; new->len = params->tmpl_len; memcpy(new->data, params->tmpl, params->tmpl_len); rcu_assign_pointer(link->u.ap.fils_discovery, new); } else { RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL); } *changed |= BSS_CHANGED_FILS_DISCOVERY; return 0; } static int ieee80211_set_unsol_bcast_probe_resp(struct ieee80211_sub_if_data *sdata, struct cfg80211_unsol_bcast_probe_resp *params, struct ieee80211_link_data *link, struct ieee80211_bss_conf *link_conf, u64 *changed) { struct unsol_bcast_probe_resp_data *new, *old = NULL; if (!params->update) return 0; link_conf->unsol_bcast_probe_resp_interval = params->interval; old = sdata_dereference(link->u.ap.unsol_bcast_probe_resp, sdata); if (old) kfree_rcu(old, rcu_head); if (params->tmpl && params->tmpl_len) { new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL); if (!new) return -ENOMEM; new->len = params->tmpl_len; memcpy(new->data, params->tmpl, params->tmpl_len); rcu_assign_pointer(link->u.ap.unsol_bcast_probe_resp, new); } else { RCU_INIT_POINTER(link->u.ap.unsol_bcast_probe_resp, NULL); } *changed |= BSS_CHANGED_UNSOL_BCAST_PROBE_RESP; return 0; } static int ieee80211_set_ftm_responder_params( struct ieee80211_sub_if_data *sdata, const u8 *lci, size_t lci_len, const u8 *civicloc, size_t civicloc_len, struct ieee80211_bss_conf *link_conf) { struct ieee80211_ftm_responder_params *new, *old; u8 *pos; int len; if (!lci_len && !civicloc_len) return 0; old = link_conf->ftmr_params; len = lci_len + civicloc_len; new = kzalloc(sizeof(*new) + len, GFP_KERNEL); if (!new) return -ENOMEM; pos = (u8 *)(new + 1); if (lci_len) { new->lci_len = lci_len; new->lci = pos; memcpy(pos, lci, lci_len); pos += lci_len; } if (civicloc_len) { new->civicloc_len = civicloc_len; new->civicloc = pos; memcpy(pos, civicloc, civicloc_len); pos += civicloc_len; } link_conf->ftmr_params = new; kfree(old); return 0; } static int ieee80211_copy_mbssid_beacon(u8 *pos, struct cfg80211_mbssid_elems *dst, struct cfg80211_mbssid_elems *src) { int i, offset = 0; dst->cnt = src->cnt; for (i = 0; i < src->cnt; i++) { memcpy(pos + offset, src->elem[i].data, src->elem[i].len); dst->elem[i].len = src->elem[i].len; dst->elem[i].data = pos + offset; offset += dst->elem[i].len; } return offset; } static int ieee80211_copy_rnr_beacon(u8 *pos, struct cfg80211_rnr_elems *dst, struct cfg80211_rnr_elems *src) { int i, offset = 0; for (i = 0; i < src->cnt; i++) { memcpy(pos + offset, src->elem[i].data, src->elem[i].len); dst->elem[i].len = src->elem[i].len; dst->elem[i].data = pos + offset; offset += dst->elem[i].len; } dst->cnt = src->cnt; return offset; } static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct cfg80211_beacon_data *params, const struct ieee80211_csa_settings *csa, const struct ieee80211_color_change_settings *cca, u64 *changed) { struct cfg80211_mbssid_elems *mbssid = NULL; struct cfg80211_rnr_elems *rnr = NULL; struct beacon_data *new, *old; int new_head_len, new_tail_len; int size, err; u64 _changed = BSS_CHANGED_BEACON; struct ieee80211_bss_conf *link_conf = link->conf; old = sdata_dereference(link->u.ap.beacon, sdata); /* Need to have a beacon head if we don't have one yet */ if (!params->head && !old) return -EINVAL; /* new or old head? */ if (params->head) new_head_len = params->head_len; else new_head_len = old->head_len; /* new or old tail? */ if (params->tail || !old) /* params->tail_len will be zero for !params->tail */ new_tail_len = params->tail_len; else new_tail_len = old->tail_len; size = sizeof(*new) + new_head_len + new_tail_len; /* new or old multiple BSSID elements? */ if (params->mbssid_ies) { mbssid = params->mbssid_ies; size += struct_size(new->mbssid_ies, elem, mbssid->cnt); if (params->rnr_ies) { rnr = params->rnr_ies; size += struct_size(new->rnr_ies, elem, rnr->cnt); } size += ieee80211_get_mbssid_beacon_len(mbssid, rnr, mbssid->cnt); } else if (old && old->mbssid_ies) { mbssid = old->mbssid_ies; size += struct_size(new->mbssid_ies, elem, mbssid->cnt); if (old && old->rnr_ies) { rnr = old->rnr_ies; size += struct_size(new->rnr_ies, elem, rnr->cnt); } size += ieee80211_get_mbssid_beacon_len(mbssid, rnr, mbssid->cnt); } new = kzalloc(size, GFP_KERNEL); if (!new) return -ENOMEM; /* start filling the new info now */ /* * pointers go into the block we allocated, * memory is | beacon_data | head | tail | mbssid_ies | rnr_ies */ new->head = ((u8 *) new) + sizeof(*new); new->tail = new->head + new_head_len; new->head_len = new_head_len; new->tail_len = new_tail_len; /* copy in optional mbssid_ies */ if (mbssid) { u8 *pos = new->tail + new->tail_len; new->mbssid_ies = (void *)pos; pos += struct_size(new->mbssid_ies, elem, mbssid->cnt); pos += ieee80211_copy_mbssid_beacon(pos, new->mbssid_ies, mbssid); if (rnr) { new->rnr_ies = (void *)pos; pos += struct_size(new->rnr_ies, elem, rnr->cnt); ieee80211_copy_rnr_beacon(pos, new->rnr_ies, rnr); } /* update bssid_indicator */ link_conf->bssid_indicator = ilog2(__roundup_pow_of_two(mbssid->cnt + 1)); } if (csa) { new->cntdwn_current_counter = csa->count; memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_beacon, csa->n_counter_offsets_beacon * sizeof(new->cntdwn_counter_offsets[0])); } else if (cca) { new->cntdwn_current_counter = cca->count; new->cntdwn_counter_offsets[0] = cca->counter_offset_beacon; } /* copy in head */ if (params->head) memcpy(new->head, params->head, new_head_len); else memcpy(new->head, old->head, new_head_len); /* copy in optional tail */ if (params->tail) memcpy(new->tail, params->tail, new_tail_len); else if (old) memcpy(new->tail, old->tail, new_tail_len); err = ieee80211_set_probe_resp(sdata, params->probe_resp, params->probe_resp_len, csa, cca, link); if (err < 0) { kfree(new); return err; } if (err == 0) _changed |= BSS_CHANGED_AP_PROBE_RESP; if (params->ftm_responder != -1) { link_conf->ftm_responder = params->ftm_responder; err = ieee80211_set_ftm_responder_params(sdata, params->lci, params->lci_len, params->civicloc, params->civicloc_len, link_conf); if (err < 0) { kfree(new); return err; } _changed |= BSS_CHANGED_FTM_RESPONDER; } rcu_assign_pointer(link->u.ap.beacon, new); sdata->u.ap.active = true; if (old) kfree_rcu(old, rcu_head); *changed |= _changed; return 0; } static u8 ieee80211_num_beaconing_links(struct ieee80211_sub_if_data *sdata) { struct ieee80211_link_data *link; u8 link_id, num = 0; if (sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_P2P_GO) return num; if (!sdata->vif.valid_links) return num; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; if (sdata_dereference(link->u.ap.beacon, sdata)) num++; } return num; } static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ap_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct beacon_data *old; struct ieee80211_sub_if_data *vlan; u64 changed = BSS_CHANGED_BEACON_INT | BSS_CHANGED_BEACON_ENABLED | BSS_CHANGED_BEACON | BSS_CHANGED_P2P_PS | BSS_CHANGED_TXPOWER | BSS_CHANGED_TWT; int i, err; int prev_beacon_int; unsigned int link_id = params->beacon.link_id; struct ieee80211_link_data *link; struct ieee80211_bss_conf *link_conf; struct ieee80211_chan_req chanreq = { .oper = params->chandef }; lockdep_assert_wiphy(local->hw.wiphy); link = sdata_dereference(sdata->link[link_id], sdata); if (!link) return -ENOLINK; link_conf = link->conf; old = sdata_dereference(link->u.ap.beacon, sdata); if (old) return -EALREADY; link->smps_mode = IEEE80211_SMPS_OFF; link->needed_rx_chains = sdata->local->rx_chains; prev_beacon_int = link_conf->beacon_int; link_conf->beacon_int = params->beacon_interval; if (params->ht_cap) link_conf->ht_ldpc = params->ht_cap->cap_info & cpu_to_le16(IEEE80211_HT_CAP_LDPC_CODING); if (params->vht_cap) { link_conf->vht_ldpc = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_RXLDPC); link_conf->vht_su_beamformer = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE); link_conf->vht_su_beamformee = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE); link_conf->vht_mu_beamformer = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE); link_conf->vht_mu_beamformee = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); } if (params->he_cap && params->he_oper) { link_conf->he_support = true; link_conf->htc_trig_based_pkt_ext = le32_get_bits(params->he_oper->he_oper_params, IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK); link_conf->frame_time_rts_th = le32_get_bits(params->he_oper->he_oper_params, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); changed |= BSS_CHANGED_HE_OBSS_PD; if (params->beacon.he_bss_color.enabled) changed |= BSS_CHANGED_HE_BSS_COLOR; } if (params->he_cap) { link_conf->he_ldpc = params->he_cap->phy_cap_info[1] & IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD; link_conf->he_su_beamformer = params->he_cap->phy_cap_info[3] & IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER; link_conf->he_su_beamformee = params->he_cap->phy_cap_info[4] & IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE; link_conf->he_mu_beamformer = params->he_cap->phy_cap_info[4] & IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER; link_conf->he_full_ul_mumimo = params->he_cap->phy_cap_info[2] & IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO; } if (params->eht_cap) { if (!link_conf->he_support) return -EOPNOTSUPP; link_conf->eht_support = true; link_conf->eht_su_beamformer = params->eht_cap->fixed.phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER; link_conf->eht_su_beamformee = params->eht_cap->fixed.phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE; link_conf->eht_mu_beamformer = params->eht_cap->fixed.phy_cap_info[7] & (IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ); link_conf->eht_80mhz_full_bw_ul_mumimo = params->eht_cap->fixed.phy_cap_info[7] & (IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ); } else { link_conf->eht_su_beamformer = false; link_conf->eht_su_beamformee = false; link_conf->eht_mu_beamformer = false; } if (sdata->vif.type == NL80211_IFTYPE_AP && params->mbssid_config.tx_wdev) { err = ieee80211_set_ap_mbssid_options(sdata, &params->mbssid_config, link_conf); if (err) return err; } err = ieee80211_link_use_channel(link, &chanreq, IEEE80211_CHANCTX_SHARED); if (!err) ieee80211_link_copy_chanctx_to_vlans(link, false); if (err) { link_conf->beacon_int = prev_beacon_int; return err; } /* * Apply control port protocol, this allows us to * not encrypt dynamic WEP control frames. */ sdata->control_port_protocol = params->crypto.control_port_ethertype; sdata->control_port_no_encrypt = params->crypto.control_port_no_encrypt; sdata->control_port_over_nl80211 = params->crypto.control_port_over_nl80211; sdata->control_port_no_preauth = params->crypto.control_port_no_preauth; list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { vlan->control_port_protocol = params->crypto.control_port_ethertype; vlan->control_port_no_encrypt = params->crypto.control_port_no_encrypt; vlan->control_port_over_nl80211 = params->crypto.control_port_over_nl80211; vlan->control_port_no_preauth = params->crypto.control_port_no_preauth; } link_conf->dtim_period = params->dtim_period; link_conf->enable_beacon = true; link_conf->allow_p2p_go_ps = sdata->vif.p2p; link_conf->twt_responder = params->twt_responder; link_conf->he_obss_pd = params->he_obss_pd; link_conf->he_bss_color = params->beacon.he_bss_color; sdata->vif.cfg.s1g = params->chandef.chan->band == NL80211_BAND_S1GHZ; sdata->vif.cfg.ssid_len = params->ssid_len; if (params->ssid_len) memcpy(sdata->vif.cfg.ssid, params->ssid, params->ssid_len); link_conf->hidden_ssid = (params->hidden_ssid != NL80211_HIDDEN_SSID_NOT_IN_USE); memset(&link_conf->p2p_noa_attr, 0, sizeof(link_conf->p2p_noa_attr)); link_conf->p2p_noa_attr.oppps_ctwindow = params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK; if (params->p2p_opp_ps) link_conf->p2p_noa_attr.oppps_ctwindow |= IEEE80211_P2P_OPPPS_ENABLE_BIT; sdata->beacon_rate_set = false; if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { for (i = 0; i < NUM_NL80211_BANDS; i++) { sdata->beacon_rateidx_mask[i] = params->beacon_rate.control[i].legacy; if (sdata->beacon_rateidx_mask[i]) sdata->beacon_rate_set = true; } } if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) link_conf->beacon_tx_rate = params->beacon_rate; err = ieee80211_assign_beacon(sdata, link, &params->beacon, NULL, NULL, &changed); if (err < 0) goto error; err = ieee80211_set_fils_discovery(sdata, &params->fils_discovery, link, link_conf, &changed); if (err < 0) goto error; err = ieee80211_set_unsol_bcast_probe_resp(sdata, &params->unsol_bcast_probe_resp, link, link_conf, &changed); if (err < 0) goto error; err = drv_start_ap(sdata->local, sdata, link_conf); if (err) { old = sdata_dereference(link->u.ap.beacon, sdata); if (old) kfree_rcu(old, rcu_head); RCU_INIT_POINTER(link->u.ap.beacon, NULL); if (ieee80211_num_beaconing_links(sdata) == 0) sdata->u.ap.active = false; goto error; } ieee80211_recalc_dtim(local, sdata); ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_SSID); ieee80211_link_info_change_notify(sdata, link, changed); if (ieee80211_num_beaconing_links(sdata) <= 1) netif_carrier_on(dev); list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_on(vlan->dev); return 0; error: ieee80211_link_release_channel(link); return err; } static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ap_update *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link; struct cfg80211_beacon_data *beacon = &params->beacon; struct beacon_data *old; int err; struct ieee80211_bss_conf *link_conf; u64 changed = 0; lockdep_assert_wiphy(wiphy); link = sdata_dereference(sdata->link[beacon->link_id], sdata); if (!link) return -ENOLINK; link_conf = link->conf; /* don't allow changing the beacon while a countdown is in place - offset * of channel switch counter may change */ if (link_conf->csa_active || link_conf->color_change_active) return -EBUSY; old = sdata_dereference(link->u.ap.beacon, sdata); if (!old) return -ENOENT; err = ieee80211_assign_beacon(sdata, link, beacon, NULL, NULL, &changed); if (err < 0) return err; err = ieee80211_set_fils_discovery(sdata, &params->fils_discovery, link, link_conf, &changed); if (err < 0) return err; err = ieee80211_set_unsol_bcast_probe_resp(sdata, &params->unsol_bcast_probe_resp, link, link_conf, &changed); if (err < 0) return err; if (beacon->he_bss_color_valid && beacon->he_bss_color.enabled != link_conf->he_bss_color.enabled) { link_conf->he_bss_color.enabled = beacon->he_bss_color.enabled; changed |= BSS_CHANGED_HE_BSS_COLOR; } ieee80211_link_info_change_notify(sdata, link, changed); return 0; } static void ieee80211_free_next_beacon(struct ieee80211_link_data *link) { if (!link->u.ap.next_beacon) return; kfree(link->u.ap.next_beacon->mbssid_ies); kfree(link->u.ap.next_beacon->rnr_ies); kfree(link->u.ap.next_beacon); link->u.ap.next_beacon = NULL; } static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_sub_if_data *vlan; struct ieee80211_local *local = sdata->local; struct beacon_data *old_beacon; struct probe_resp *old_probe_resp; struct fils_discovery_data *old_fils_discovery; struct unsol_bcast_probe_resp_data *old_unsol_bcast_probe_resp; struct cfg80211_chan_def chandef; struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); struct ieee80211_bss_conf *link_conf = link->conf; LIST_HEAD(keys); lockdep_assert_wiphy(local->hw.wiphy); old_beacon = sdata_dereference(link->u.ap.beacon, sdata); if (!old_beacon) return -ENOENT; old_probe_resp = sdata_dereference(link->u.ap.probe_resp, sdata); old_fils_discovery = sdata_dereference(link->u.ap.fils_discovery, sdata); old_unsol_bcast_probe_resp = sdata_dereference(link->u.ap.unsol_bcast_probe_resp, sdata); /* abort any running channel switch or color change */ link_conf->csa_active = false; link_conf->color_change_active = false; ieee80211_vif_unblock_queues_csa(sdata); ieee80211_free_next_beacon(link); /* turn off carrier for this interface and dependent VLANs */ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_off(vlan->dev); if (ieee80211_num_beaconing_links(sdata) <= 1) { netif_carrier_off(dev); sdata->u.ap.active = false; } /* remove beacon and probe response */ RCU_INIT_POINTER(link->u.ap.beacon, NULL); RCU_INIT_POINTER(link->u.ap.probe_resp, NULL); RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL); RCU_INIT_POINTER(link->u.ap.unsol_bcast_probe_resp, NULL); kfree_rcu(old_beacon, rcu_head); if (old_probe_resp) kfree_rcu(old_probe_resp, rcu_head); if (old_fils_discovery) kfree_rcu(old_fils_discovery, rcu_head); if (old_unsol_bcast_probe_resp) kfree_rcu(old_unsol_bcast_probe_resp, rcu_head); kfree(link_conf->ftmr_params); link_conf->ftmr_params = NULL; link_conf->bssid_index = 0; link_conf->nontransmitted = false; link_conf->ema_ap = false; link_conf->bssid_indicator = 0; __sta_info_flush(sdata, true, link_id, NULL); ieee80211_remove_link_keys(link, &keys); if (!list_empty(&keys)) { synchronize_net(); ieee80211_free_key_list(local, &keys); } ieee80211_stop_mbssid(sdata); RCU_INIT_POINTER(link_conf->tx_bss_conf, NULL); link_conf->enable_beacon = false; sdata->beacon_rate_set = false; sdata->vif.cfg.ssid_len = 0; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_BEACON_ENABLED); if (sdata->wdev.links[link_id].cac_started) { chandef = link_conf->chanreq.oper; wiphy_delayed_work_cancel(wiphy, &link->dfs_cac_timer_work); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, GFP_KERNEL, link_id); } drv_stop_ap(sdata->local, sdata, link_conf); /* free all potentially still buffered bcast frames */ local->total_ps_buffered -= skb_queue_len(&sdata->u.ap.ps.bc_buf); ieee80211_purge_tx_queue(&local->hw, &sdata->u.ap.ps.bc_buf); ieee80211_link_copy_chanctx_to_vlans(link, true); ieee80211_link_release_channel(link); return 0; } static int sta_apply_auth_flags(struct ieee80211_local *local, struct sta_info *sta, u32 mask, u32 set) { int ret; if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED) && set & BIT(NL80211_STA_FLAG_AUTHENTICATED) && !test_sta_flag(sta, WLAN_STA_AUTH)) { ret = sta_info_move_state(sta, IEEE80211_STA_AUTH); if (ret) return ret; } if (mask & BIT(NL80211_STA_FLAG_ASSOCIATED) && set & BIT(NL80211_STA_FLAG_ASSOCIATED) && !test_sta_flag(sta, WLAN_STA_ASSOC)) { /* * When peer becomes associated, init rate control as * well. Some drivers require rate control initialized * before drv_sta_state() is called. */ if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) rate_control_rate_init_all_links(sta); ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (ret) return ret; } if (mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) { if (set & BIT(NL80211_STA_FLAG_AUTHORIZED)) ret = sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED); else if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); else ret = 0; if (ret) return ret; } if (mask & BIT(NL80211_STA_FLAG_ASSOCIATED) && !(set & BIT(NL80211_STA_FLAG_ASSOCIATED)) && test_sta_flag(sta, WLAN_STA_ASSOC)) { ret = sta_info_move_state(sta, IEEE80211_STA_AUTH); if (ret) return ret; } if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED) && !(set & BIT(NL80211_STA_FLAG_AUTHENTICATED)) && test_sta_flag(sta, WLAN_STA_AUTH)) { ret = sta_info_move_state(sta, IEEE80211_STA_NONE); if (ret) return ret; } return 0; } static void sta_apply_mesh_params(struct ieee80211_local *local, struct sta_info *sta, struct station_parameters *params) { #ifdef CONFIG_MAC80211_MESH struct ieee80211_sub_if_data *sdata = sta->sdata; u64 changed = 0; if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) { switch (params->plink_state) { case NL80211_PLINK_ESTAB: if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) changed = mesh_plink_inc_estab_count(sdata); sta->mesh->plink_state = params->plink_state; sta->mesh->aid = params->peer_aid; ieee80211_mps_sta_status_update(sta); changed |= ieee80211_mps_set_sta_local_pm(sta, sdata->u.mesh.mshcfg.power_mode); ewma_mesh_tx_rate_avg_init(&sta->mesh->tx_rate_avg); /* init at low value */ ewma_mesh_tx_rate_avg_add(&sta->mesh->tx_rate_avg, 10); break; case NL80211_PLINK_LISTEN: case NL80211_PLINK_BLOCKED: case NL80211_PLINK_OPN_SNT: case NL80211_PLINK_OPN_RCVD: case NL80211_PLINK_CNF_RCVD: case NL80211_PLINK_HOLDING: if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) changed = mesh_plink_dec_estab_count(sdata); sta->mesh->plink_state = params->plink_state; ieee80211_mps_sta_status_update(sta); changed |= ieee80211_mps_set_sta_local_pm(sta, NL80211_MESH_POWER_UNKNOWN); break; default: /* nothing */ break; } } switch (params->plink_action) { case NL80211_PLINK_ACTION_NO_ACTION: /* nothing */ break; case NL80211_PLINK_ACTION_OPEN: changed |= mesh_plink_open(sta); break; case NL80211_PLINK_ACTION_BLOCK: changed |= mesh_plink_block(sta); break; } if (params->local_pm) changed |= ieee80211_mps_set_sta_local_pm(sta, params->local_pm); ieee80211_mbss_info_change_notify(sdata, changed); #endif } enum sta_link_apply_mode { STA_LINK_MODE_NEW, STA_LINK_MODE_STA_MODIFY, STA_LINK_MODE_LINK_MODIFY, }; static int sta_link_apply_parameters(struct ieee80211_local *local, struct sta_info *sta, enum sta_link_apply_mode mode, struct link_station_parameters *params) { struct ieee80211_supported_band *sband; struct ieee80211_sub_if_data *sdata = sta->sdata; u32 link_id = params->link_id < 0 ? 0 : params->link_id; struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); struct link_sta_info *link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&local->hw.wiphy->mtx)); bool changes = params->link_mac || params->txpwr_set || params->supported_rates_len || params->ht_capa || params->vht_capa || params->he_capa || params->eht_capa || params->opmode_notif_used; switch (mode) { case STA_LINK_MODE_NEW: if (!params->link_mac) return -EINVAL; break; case STA_LINK_MODE_LINK_MODIFY: break; case STA_LINK_MODE_STA_MODIFY: if (params->link_id >= 0) break; if (!changes) return 0; break; } if (!link || !link_sta) return -EINVAL; sband = ieee80211_get_link_sband(link); if (!sband) return -EINVAL; if (params->link_mac) { if (mode == STA_LINK_MODE_NEW) { memcpy(link_sta->addr, params->link_mac, ETH_ALEN); memcpy(link_sta->pub->addr, params->link_mac, ETH_ALEN); } else if (!ether_addr_equal(link_sta->addr, params->link_mac)) { return -EINVAL; } } if (params->txpwr_set) { int ret; link_sta->pub->txpwr.type = params->txpwr.type; if (params->txpwr.type == NL80211_TX_POWER_LIMITED) link_sta->pub->txpwr.power = params->txpwr.power; ret = drv_sta_set_txpwr(local, sdata, sta); if (ret) return ret; } if (params->supported_rates && params->supported_rates_len && !ieee80211_parse_bitrates(link->conf->chanreq.oper.width, sband, params->supported_rates, params->supported_rates_len, &link_sta->pub->supp_rates[sband->band])) return -EINVAL; if (params->ht_capa) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, params->ht_capa, link_sta); /* VHT can override some HT caps such as the A-MSDU max length */ if (params->vht_capa) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, params->vht_capa, NULL, link_sta); if (params->he_capa) ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, (void *)params->he_capa, params->he_capa_len, (void *)params->he_6ghz_capa, link_sta); if (params->he_capa && params->eht_capa) ieee80211_eht_cap_ie_to_sta_eht_cap(sdata, sband, (u8 *)params->he_capa, params->he_capa_len, params->eht_capa, params->eht_capa_len, link_sta); ieee80211_sta_init_nss(link_sta); if (params->opmode_notif_used) { /* returned value is only needed for rc update, but the * rc isn't initialized here yet, so ignore it */ __ieee80211_vht_handle_opmode(sdata, link_sta, params->opmode_notif, sband->band); } return 0; } static int sta_apply_parameters(struct ieee80211_local *local, struct sta_info *sta, struct station_parameters *params) { struct ieee80211_sub_if_data *sdata = sta->sdata; u32 mask, set; int ret = 0; mask = params->sta_flags_mask; set = params->sta_flags_set; if (ieee80211_vif_is_mesh(&sdata->vif)) { /* * In mesh mode, ASSOCIATED isn't part of the nl80211 * API but must follow AUTHENTICATED for driver state. */ if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED)) mask |= BIT(NL80211_STA_FLAG_ASSOCIATED); if (set & BIT(NL80211_STA_FLAG_AUTHENTICATED)) set |= BIT(NL80211_STA_FLAG_ASSOCIATED); } else if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { /* * TDLS -- everything follows authorized, but * only becoming authorized is possible, not * going back */ if (set & BIT(NL80211_STA_FLAG_AUTHORIZED)) { set |= BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED); mask |= BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED); } } if (mask & BIT(NL80211_STA_FLAG_WME) && local->hw.queues >= IEEE80211_NUM_ACS) sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME); /* auth flags will be set later for TDLS, * and for unassociated stations that move to associated */ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) && !((mask & BIT(NL80211_STA_FLAG_ASSOCIATED)) && (set & BIT(NL80211_STA_FLAG_ASSOCIATED)))) { ret = sta_apply_auth_flags(local, sta, mask, set); if (ret) return ret; } if (mask & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) { if (set & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) set_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE); else clear_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE); } if (mask & BIT(NL80211_STA_FLAG_MFP)) { sta->sta.mfp = !!(set & BIT(NL80211_STA_FLAG_MFP)); if (set & BIT(NL80211_STA_FLAG_MFP)) set_sta_flag(sta, WLAN_STA_MFP); else clear_sta_flag(sta, WLAN_STA_MFP); } if (mask & BIT(NL80211_STA_FLAG_TDLS_PEER)) { if (set & BIT(NL80211_STA_FLAG_TDLS_PEER)) set_sta_flag(sta, WLAN_STA_TDLS_PEER); else clear_sta_flag(sta, WLAN_STA_TDLS_PEER); } if (mask & BIT(NL80211_STA_FLAG_SPP_AMSDU)) sta->sta.spp_amsdu = set & BIT(NL80211_STA_FLAG_SPP_AMSDU); /* mark TDLS channel switch support, if the AP allows it */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && !sdata->deflink.u.mgd.tdls_chan_switch_prohibited && params->ext_capab_len >= 4 && params->ext_capab[3] & WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH) set_sta_flag(sta, WLAN_STA_TDLS_CHAN_SWITCH); if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && !sdata->u.mgd.tdls_wider_bw_prohibited && ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) && params->ext_capab_len >= 8 && params->ext_capab[7] & WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED) set_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW); if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD) { sta->sta.uapsd_queues = params->uapsd_queues; sta->sta.max_sp = params->max_sp; } ieee80211_sta_set_max_amsdu_subframes(sta, params->ext_capab, params->ext_capab_len); /* * cfg80211 validates this (1-2007) and allows setting the AID * only when creating a new station entry */ if (params->aid) sta->sta.aid = params->aid; /* * Some of the following updates would be racy if called on an * existing station, via ieee80211_change_station(). However, * all such changes are rejected by cfg80211 except for updates * changing the supported rates on an existing but not yet used * TDLS peer. */ if (params->listen_interval >= 0) sta->listen_interval = params->listen_interval; if (params->eml_cap_present) sta->sta.eml_cap = params->eml_cap; ret = sta_link_apply_parameters(local, sta, STA_LINK_MODE_STA_MODIFY, &params->link_sta_params); if (ret) return ret; if (params->support_p2p_ps >= 0) sta->sta.support_p2p_ps = params->support_p2p_ps; if (ieee80211_vif_is_mesh(&sdata->vif)) sta_apply_mesh_params(local, sta, params); if (params->airtime_weight) sta->airtime_weight = params->airtime_weight; /* set the STA state after all sta info from usermode has been set */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) || set & BIT(NL80211_STA_FLAG_ASSOCIATED)) { ret = sta_apply_auth_flags(local, sta, mask, set); if (ret) return ret; } /* Mark the STA as MLO if MLD MAC address is available */ if (params->link_sta_params.mld_mac) sta->sta.mlo = true; return 0; } static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, const u8 *mac, struct station_parameters *params) { struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; struct ieee80211_sub_if_data *sdata; int err; lockdep_assert_wiphy(local->hw.wiphy); if (params->vlan) { sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && sdata->vif.type != NL80211_IFTYPE_AP) return -EINVAL; } else sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (ether_addr_equal(mac, sdata->vif.addr)) return -EINVAL; if (!is_valid_ether_addr(mac)) return -EINVAL; if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER) && sdata->vif.type == NL80211_IFTYPE_STATION && !sdata->u.mgd.associated) return -EINVAL; /* * If we have a link ID, it can be a non-MLO station on an AP MLD, * but we need to have a link_mac in that case as well, so use the * STA's MAC address in that case. */ if (params->link_sta_params.link_id >= 0) sta = sta_info_alloc_with_link(sdata, mac, params->link_sta_params.link_id, params->link_sta_params.link_mac ?: mac, GFP_KERNEL); else sta = sta_info_alloc(sdata, mac, GFP_KERNEL); if (!sta) return -ENOMEM; if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) sta->sta.tdls = true; /* Though the mutex is not needed here (since the station is not * visible yet), sta_apply_parameters (and inner functions) require * the mutex due to other paths. */ err = sta_apply_parameters(local, sta, params); if (err) { sta_info_free(local, sta); return err; } /* * for TDLS and for unassociated station, rate control should be * initialized only when rates are known and station is marked * authorized/associated */ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) && test_sta_flag(sta, WLAN_STA_ASSOC)) rate_control_rate_init_all_links(sta); return sta_info_insert(sta); } static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, struct station_del_parameters *params) { struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (params->mac) return sta_info_destroy_addr_bss(sdata, params->mac); sta_info_flush(sdata, params->link_id); return 0; } static int ieee80211_change_station(struct wiphy *wiphy, struct net_device *dev, const u8 *mac, struct station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; struct ieee80211_sub_if_data *vlansdata; enum cfg80211_station_type statype; int err; lockdep_assert_wiphy(local->hw.wiphy); sta = sta_info_get_bss(sdata, mac); if (!sta) return -ENOENT; switch (sdata->vif.type) { case NL80211_IFTYPE_MESH_POINT: if (sdata->u.mesh.user_mpm) statype = CFG80211_STA_MESH_PEER_USER; else statype = CFG80211_STA_MESH_PEER_KERNEL; break; case NL80211_IFTYPE_ADHOC: statype = CFG80211_STA_IBSS; break; case NL80211_IFTYPE_STATION: if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { statype = CFG80211_STA_AP_STA; break; } if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) statype = CFG80211_STA_TDLS_PEER_ACTIVE; else statype = CFG80211_STA_TDLS_PEER_SETUP; break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: if (test_sta_flag(sta, WLAN_STA_ASSOC)) statype = CFG80211_STA_AP_CLIENT; else statype = CFG80211_STA_AP_CLIENT_UNASSOC; break; default: return -EOPNOTSUPP; } err = cfg80211_check_station_change(wiphy, params, statype); if (err) return err; if (params->vlan && params->vlan != sta->sdata->dev) { vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); if (params->vlan->ieee80211_ptr->use_4addr) { if (vlansdata->u.vlan.sta) return -EBUSY; rcu_assign_pointer(vlansdata->u.vlan.sta, sta); __ieee80211_check_fast_rx_iface(vlansdata); drv_sta_set_4addr(local, sta->sdata, &sta->sta, true); } if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && sta->sdata->u.vlan.sta) RCU_INIT_POINTER(sta->sdata->u.vlan.sta, NULL); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) ieee80211_vif_dec_num_mcast(sta->sdata); sta->sdata = vlansdata; ieee80211_check_fast_rx(sta); ieee80211_check_fast_xmit(sta); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { ieee80211_vif_inc_num_mcast(sta->sdata); cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr); } } err = sta_apply_parameters(local, sta, params); if (err) return err; if (sdata->vif.type == NL80211_IFTYPE_STATION && params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) { ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); } return 0; } #ifdef CONFIG_MAC80211_MESH static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev, const u8 *dst, const u8 *next_hop) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; struct sta_info *sta; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); sta = sta_info_get(sdata, next_hop); if (!sta) { rcu_read_unlock(); return -ENOENT; } mpath = mesh_path_add(sdata, dst); if (IS_ERR(mpath)) { rcu_read_unlock(); return PTR_ERR(mpath); } mesh_path_fix_nexthop(mpath, sta); rcu_read_unlock(); return 0; } static int ieee80211_del_mpath(struct wiphy *wiphy, struct net_device *dev, const u8 *dst) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (dst) return mesh_path_del(sdata, dst); mesh_path_flush_by_iface(sdata); return 0; } static int ieee80211_change_mpath(struct wiphy *wiphy, struct net_device *dev, const u8 *dst, const u8 *next_hop) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; struct sta_info *sta; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); sta = sta_info_get(sdata, next_hop); if (!sta) { rcu_read_unlock(); return -ENOENT; } mpath = mesh_path_lookup(sdata, dst); if (!mpath) { rcu_read_unlock(); return -ENOENT; } mesh_path_fix_nexthop(mpath, sta); rcu_read_unlock(); return 0; } static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, struct mpath_info *pinfo) { struct sta_info *next_hop_sta = rcu_dereference(mpath->next_hop); if (next_hop_sta) memcpy(next_hop, next_hop_sta->sta.addr, ETH_ALEN); else eth_zero_addr(next_hop); memset(pinfo, 0, sizeof(*pinfo)); pinfo->generation = mpath->sdata->u.mesh.mesh_paths_generation; pinfo->filled = MPATH_INFO_FRAME_QLEN | MPATH_INFO_SN | MPATH_INFO_METRIC | MPATH_INFO_EXPTIME | MPATH_INFO_DISCOVERY_TIMEOUT | MPATH_INFO_DISCOVERY_RETRIES | MPATH_INFO_FLAGS | MPATH_INFO_HOP_COUNT | MPATH_INFO_PATH_CHANGE; pinfo->frame_qlen = mpath->frame_queue.qlen; pinfo->sn = mpath->sn; pinfo->metric = mpath->metric; if (time_before(jiffies, mpath->exp_time)) pinfo->exptime = jiffies_to_msecs(mpath->exp_time - jiffies); pinfo->discovery_timeout = jiffies_to_msecs(mpath->discovery_timeout); pinfo->discovery_retries = mpath->discovery_retries; if (mpath->flags & MESH_PATH_ACTIVE) pinfo->flags |= NL80211_MPATH_FLAG_ACTIVE; if (mpath->flags & MESH_PATH_RESOLVING) pinfo->flags |= NL80211_MPATH_FLAG_RESOLVING; if (mpath->flags & MESH_PATH_SN_VALID) pinfo->flags |= NL80211_MPATH_FLAG_SN_VALID; if (mpath->flags & MESH_PATH_FIXED) pinfo->flags |= NL80211_MPATH_FLAG_FIXED; if (mpath->flags & MESH_PATH_RESOLVED) pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED; pinfo->hop_count = mpath->hop_count; pinfo->path_change_count = mpath->path_change_count; } static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); mpath = mesh_path_lookup(sdata, dst); if (!mpath) { rcu_read_unlock(); return -ENOENT; } memcpy(dst, mpath->dst, ETH_ALEN); mpath_set_pinfo(mpath, next_hop, pinfo); rcu_read_unlock(); return 0; } static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); mpath = mesh_path_lookup_by_idx(sdata, idx); if (!mpath) { rcu_read_unlock(); return -ENOENT; } memcpy(dst, mpath->dst, ETH_ALEN); mpath_set_pinfo(mpath, next_hop, pinfo); rcu_read_unlock(); return 0; } static void mpp_set_pinfo(struct mesh_path *mpath, u8 *mpp, struct mpath_info *pinfo) { memset(pinfo, 0, sizeof(*pinfo)); memcpy(mpp, mpath->mpp, ETH_ALEN); pinfo->generation = mpath->sdata->u.mesh.mpp_paths_generation; } static int ieee80211_get_mpp(struct wiphy *wiphy, struct net_device *dev, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); mpath = mpp_path_lookup(sdata, dst); if (!mpath) { rcu_read_unlock(); return -ENOENT; } memcpy(dst, mpath->dst, ETH_ALEN); mpp_set_pinfo(mpath, mpp, pinfo); rcu_read_unlock(); return 0; } static int ieee80211_dump_mpp(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); mpath = mpp_path_lookup_by_idx(sdata, idx); if (!mpath) { rcu_read_unlock(); return -ENOENT; } memcpy(dst, mpath->dst, ETH_ALEN); mpp_set_pinfo(mpath, mpp, pinfo); rcu_read_unlock(); return 0; } static int ieee80211_get_mesh_config(struct wiphy *wiphy, struct net_device *dev, struct mesh_config *conf) { struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); memcpy(conf, &(sdata->u.mesh.mshcfg), sizeof(struct mesh_config)); return 0; } static inline bool _chg_mesh_attr(enum nl80211_meshconf_params parm, u32 mask) { return (mask >> (parm-1)) & 0x1; } static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, const struct mesh_setup *setup) { u8 *new_ie; struct ieee80211_sub_if_data *sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); int i; /* allocate information elements */ new_ie = NULL; if (setup->ie_len) { new_ie = kmemdup(setup->ie, setup->ie_len, GFP_KERNEL); if (!new_ie) return -ENOMEM; } ifmsh->ie_len = setup->ie_len; ifmsh->ie = new_ie; /* now copy the rest of the setup parameters */ ifmsh->mesh_id_len = setup->mesh_id_len; memcpy(ifmsh->mesh_id, setup->mesh_id, ifmsh->mesh_id_len); ifmsh->mesh_sp_id = setup->sync_method; ifmsh->mesh_pp_id = setup->path_sel_proto; ifmsh->mesh_pm_id = setup->path_metric; ifmsh->user_mpm = setup->user_mpm; ifmsh->mesh_auth_id = setup->auth_id; ifmsh->security = IEEE80211_MESH_SEC_NONE; ifmsh->userspace_handles_dfs = setup->userspace_handles_dfs; if (setup->is_authenticated) ifmsh->security |= IEEE80211_MESH_SEC_AUTHED; if (setup->is_secure) ifmsh->security |= IEEE80211_MESH_SEC_SECURED; /* mcast rate setting in Mesh Node */ memcpy(sdata->vif.bss_conf.mcast_rate, setup->mcast_rate, sizeof(setup->mcast_rate)); sdata->vif.bss_conf.basic_rates = setup->basic_rates; sdata->vif.bss_conf.beacon_int = setup->beacon_interval; sdata->vif.bss_conf.dtim_period = setup->dtim_period; sdata->beacon_rate_set = false; if (wiphy_ext_feature_isset(sdata->local->hw.wiphy, NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { for (i = 0; i < NUM_NL80211_BANDS; i++) { sdata->beacon_rateidx_mask[i] = setup->beacon_rate.control[i].legacy; if (sdata->beacon_rateidx_mask[i]) sdata->beacon_rate_set = true; } } return 0; } static int ieee80211_update_mesh_config(struct wiphy *wiphy, struct net_device *dev, u32 mask, const struct mesh_config *nconf) { struct mesh_config *conf; struct ieee80211_sub_if_data *sdata; struct ieee80211_if_mesh *ifmsh; sdata = IEEE80211_DEV_TO_SUB_IF(dev); ifmsh = &sdata->u.mesh; /* Set the config options which we are interested in setting */ conf = &(sdata->u.mesh.mshcfg); if (_chg_mesh_attr(NL80211_MESHCONF_RETRY_TIMEOUT, mask)) conf->dot11MeshRetryTimeout = nconf->dot11MeshRetryTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_CONFIRM_TIMEOUT, mask)) conf->dot11MeshConfirmTimeout = nconf->dot11MeshConfirmTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_HOLDING_TIMEOUT, mask)) conf->dot11MeshHoldingTimeout = nconf->dot11MeshHoldingTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_MAX_PEER_LINKS, mask)) conf->dot11MeshMaxPeerLinks = nconf->dot11MeshMaxPeerLinks; if (_chg_mesh_attr(NL80211_MESHCONF_MAX_RETRIES, mask)) conf->dot11MeshMaxRetries = nconf->dot11MeshMaxRetries; if (_chg_mesh_attr(NL80211_MESHCONF_TTL, mask)) conf->dot11MeshTTL = nconf->dot11MeshTTL; if (_chg_mesh_attr(NL80211_MESHCONF_ELEMENT_TTL, mask)) conf->element_ttl = nconf->element_ttl; if (_chg_mesh_attr(NL80211_MESHCONF_AUTO_OPEN_PLINKS, mask)) { if (ifmsh->user_mpm) return -EBUSY; conf->auto_open_plinks = nconf->auto_open_plinks; } if (_chg_mesh_attr(NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR, mask)) conf->dot11MeshNbrOffsetMaxNeighbor = nconf->dot11MeshNbrOffsetMaxNeighbor; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, mask)) conf->dot11MeshHWMPmaxPREQretries = nconf->dot11MeshHWMPmaxPREQretries; if (_chg_mesh_attr(NL80211_MESHCONF_PATH_REFRESH_TIME, mask)) conf->path_refresh_time = nconf->path_refresh_time; if (_chg_mesh_attr(NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, mask)) conf->min_discovery_timeout = nconf->min_discovery_timeout; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, mask)) conf->dot11MeshHWMPactivePathTimeout = nconf->dot11MeshHWMPactivePathTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, mask)) conf->dot11MeshHWMPpreqMinInterval = nconf->dot11MeshHWMPpreqMinInterval; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL, mask)) conf->dot11MeshHWMPperrMinInterval = nconf->dot11MeshHWMPperrMinInterval; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, mask)) conf->dot11MeshHWMPnetDiameterTraversalTime = nconf->dot11MeshHWMPnetDiameterTraversalTime; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ROOTMODE, mask)) { conf->dot11MeshHWMPRootMode = nconf->dot11MeshHWMPRootMode; ieee80211_mesh_root_setup(ifmsh); } if (_chg_mesh_attr(NL80211_MESHCONF_GATE_ANNOUNCEMENTS, mask)) { /* our current gate announcement implementation rides on root * announcements, so require this ifmsh to also be a root node * */ if (nconf->dot11MeshGateAnnouncementProtocol && !(conf->dot11MeshHWMPRootMode > IEEE80211_ROOTMODE_ROOT)) { conf->dot11MeshHWMPRootMode = IEEE80211_PROACTIVE_RANN; ieee80211_mesh_root_setup(ifmsh); } conf->dot11MeshGateAnnouncementProtocol = nconf->dot11MeshGateAnnouncementProtocol; } if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_RANN_INTERVAL, mask)) conf->dot11MeshHWMPRannInterval = nconf->dot11MeshHWMPRannInterval; if (_chg_mesh_attr(NL80211_MESHCONF_FORWARDING, mask)) conf->dot11MeshForwarding = nconf->dot11MeshForwarding; if (_chg_mesh_attr(NL80211_MESHCONF_RSSI_THRESHOLD, mask)) { /* our RSSI threshold implementation is supported only for * devices that report signal in dBm. */ if (!ieee80211_hw_check(&sdata->local->hw, SIGNAL_DBM)) return -EOPNOTSUPP; conf->rssi_threshold = nconf->rssi_threshold; } if (_chg_mesh_attr(NL80211_MESHCONF_HT_OPMODE, mask)) { conf->ht_opmode = nconf->ht_opmode; sdata->vif.bss_conf.ht_operation_mode = nconf->ht_opmode; ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_HT); } if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, mask)) conf->dot11MeshHWMPactivePathToRootTimeout = nconf->dot11MeshHWMPactivePathToRootTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ROOT_INTERVAL, mask)) conf->dot11MeshHWMProotInterval = nconf->dot11MeshHWMProotInterval; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, mask)) conf->dot11MeshHWMPconfirmationInterval = nconf->dot11MeshHWMPconfirmationInterval; if (_chg_mesh_attr(NL80211_MESHCONF_POWER_MODE, mask)) { conf->power_mode = nconf->power_mode; ieee80211_mps_local_status_update(sdata); } if (_chg_mesh_attr(NL80211_MESHCONF_AWAKE_WINDOW, mask)) conf->dot11MeshAwakeWindowDuration = nconf->dot11MeshAwakeWindowDuration; if (_chg_mesh_attr(NL80211_MESHCONF_PLINK_TIMEOUT, mask)) conf->plink_timeout = nconf->plink_timeout; if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_GATE, mask)) conf->dot11MeshConnectedToMeshGate = nconf->dot11MeshConnectedToMeshGate; if (_chg_mesh_attr(NL80211_MESHCONF_NOLEARN, mask)) conf->dot11MeshNolearn = nconf->dot11MeshNolearn; if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_AS, mask)) conf->dot11MeshConnectedToAuthServer = nconf->dot11MeshConnectedToAuthServer; ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON); return 0; } static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev, const struct mesh_config *conf, const struct mesh_setup *setup) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_chan_req chanreq = { .oper = setup->chandef }; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; int err; lockdep_assert_wiphy(sdata->local->hw.wiphy); memcpy(&ifmsh->mshcfg, conf, sizeof(struct mesh_config)); err = copy_mesh_setup(ifmsh, setup); if (err) return err; sdata->control_port_over_nl80211 = setup->control_port_over_nl80211; /* can mesh use other SMPS modes? */ sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; sdata->deflink.needed_rx_chains = sdata->local->rx_chains; err = ieee80211_link_use_channel(&sdata->deflink, &chanreq, IEEE80211_CHANCTX_SHARED); if (err) return err; return ieee80211_start_mesh(sdata); } static int ieee80211_leave_mesh(struct wiphy *wiphy, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); lockdep_assert_wiphy(sdata->local->hw.wiphy); ieee80211_stop_mesh(sdata); ieee80211_link_release_channel(&sdata->deflink); kfree(sdata->u.mesh.ie); return 0; } #endif static int ieee80211_change_bss(struct wiphy *wiphy, struct net_device *dev, struct bss_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link; struct ieee80211_supported_band *sband; u64 changed = 0; link = ieee80211_link_or_deflink(sdata, params->link_id, true); if (IS_ERR(link)) return PTR_ERR(link); if (!sdata_dereference(link->u.ap.beacon, sdata)) return -ENOENT; sband = ieee80211_get_link_sband(link); if (!sband) return -EINVAL; if (params->basic_rates) { if (!ieee80211_parse_bitrates(link->conf->chanreq.oper.width, wiphy->bands[sband->band], params->basic_rates, params->basic_rates_len, &link->conf->basic_rates)) return -EINVAL; changed |= BSS_CHANGED_BASIC_RATES; ieee80211_check_rate_mask(link); } if (params->use_cts_prot >= 0) { link->conf->use_cts_prot = params->use_cts_prot; changed |= BSS_CHANGED_ERP_CTS_PROT; } if (params->use_short_preamble >= 0) { link->conf->use_short_preamble = params->use_short_preamble; changed |= BSS_CHANGED_ERP_PREAMBLE; } if (!link->conf->use_short_slot && (sband->band == NL80211_BAND_5GHZ || sband->band == NL80211_BAND_6GHZ)) { link->conf->use_short_slot = true; changed |= BSS_CHANGED_ERP_SLOT; } if (params->use_short_slot_time >= 0) { link->conf->use_short_slot = params->use_short_slot_time; changed |= BSS_CHANGED_ERP_SLOT; } if (params->ap_isolate >= 0) { if (params->ap_isolate) sdata->flags |= IEEE80211_SDATA_DONT_BRIDGE_PACKETS; else sdata->flags &= ~IEEE80211_SDATA_DONT_BRIDGE_PACKETS; ieee80211_check_fast_rx_iface(sdata); } if (params->ht_opmode >= 0) { link->conf->ht_operation_mode = (u16)params->ht_opmode; changed |= BSS_CHANGED_HT; } if (params->p2p_ctwindow >= 0) { link->conf->p2p_noa_attr.oppps_ctwindow &= ~IEEE80211_P2P_OPPPS_CTWINDOW_MASK; link->conf->p2p_noa_attr.oppps_ctwindow |= params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK; changed |= BSS_CHANGED_P2P_PS; } if (params->p2p_opp_ps > 0) { link->conf->p2p_noa_attr.oppps_ctwindow |= IEEE80211_P2P_OPPPS_ENABLE_BIT; changed |= BSS_CHANGED_P2P_PS; } else if (params->p2p_opp_ps == 0) { link->conf->p2p_noa_attr.oppps_ctwindow &= ~IEEE80211_P2P_OPPPS_ENABLE_BIT; changed |= BSS_CHANGED_P2P_PS; } ieee80211_link_info_change_notify(sdata, link, changed); return 0; } static int ieee80211_set_txq_params(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_txq_params *params) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, params->link_id, true); struct ieee80211_tx_queue_params p; if (!local->ops->conf_tx) return -EOPNOTSUPP; if (local->hw.queues < IEEE80211_NUM_ACS) return -EOPNOTSUPP; if (IS_ERR(link)) return PTR_ERR(link); memset(&p, 0, sizeof(p)); p.aifs = params->aifs; p.cw_max = params->cwmax; p.cw_min = params->cwmin; p.txop = params->txop; /* * Setting tx queue params disables u-apsd because it's only * called in master mode. */ p.uapsd = false; ieee80211_regulatory_limit_wmm_params(sdata, &p, params->ac); link->tx_conf[params->ac] = p; if (drv_conf_tx(local, link, params->ac, &p)) { wiphy_debug(local->hw.wiphy, "failed to set TX queue parameters for AC %d\n", params->ac); return -EINVAL; } ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS); return 0; } #ifdef CONFIG_PM static int ieee80211_suspend(struct wiphy *wiphy, struct cfg80211_wowlan *wowlan) { return __ieee80211_suspend(wiphy_priv(wiphy), wowlan); } static int ieee80211_resume(struct wiphy *wiphy) { return __ieee80211_resume(wiphy_priv(wiphy)); } #else #define ieee80211_suspend NULL #define ieee80211_resume NULL #endif static int ieee80211_scan(struct wiphy *wiphy, struct cfg80211_scan_request *req) { struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_WDEV_TO_SUB_IF(req->wdev); switch (ieee80211_vif_type_p2p(&sdata->vif)) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_DEVICE: break; case NL80211_IFTYPE_P2P_GO: if (sdata->local->ops->hw_scan) break; /* * FIXME: implement NoA while scanning in software, * for now fall through to allow scanning only when * beaconing hasn't been configured yet */ fallthrough; case NL80211_IFTYPE_AP: /* * If the scan has been forced (and the driver supports * forcing), don't care about being beaconing already. * This will create problems to the attached stations (e.g. all * the frames sent while scanning on other channel will be * lost) */ if (sdata->deflink.u.ap.beacon && (!(wiphy->features & NL80211_FEATURE_AP_SCAN) || !(req->flags & NL80211_SCAN_FLAG_AP))) return -EOPNOTSUPP; break; case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } return ieee80211_request_scan(sdata, req); } static void ieee80211_abort_scan(struct wiphy *wiphy, struct wireless_dev *wdev) { ieee80211_scan_cancel(wiphy_priv(wiphy)); } static int ieee80211_sched_scan_start(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_sched_scan_request *req) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (!sdata->local->ops->sched_scan_start) return -EOPNOTSUPP; return ieee80211_request_sched_scan_start(sdata, req); } static int ieee80211_sched_scan_stop(struct wiphy *wiphy, struct net_device *dev, u64 reqid) { struct ieee80211_local *local = wiphy_priv(wiphy); if (!local->ops->sched_scan_stop) return -EOPNOTSUPP; return ieee80211_request_sched_scan_stop(local); } static int ieee80211_auth(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_auth_request *req) { return ieee80211_mgd_auth(IEEE80211_DEV_TO_SUB_IF(dev), req); } static int ieee80211_assoc(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_assoc_request *req) { return ieee80211_mgd_assoc(IEEE80211_DEV_TO_SUB_IF(dev), req); } static int ieee80211_deauth(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_deauth_request *req) { return ieee80211_mgd_deauth(IEEE80211_DEV_TO_SUB_IF(dev), req); } static int ieee80211_disassoc(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_disassoc_request *req) { return ieee80211_mgd_disassoc(IEEE80211_DEV_TO_SUB_IF(dev), req); } static int ieee80211_join_ibss(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ibss_params *params) { return ieee80211_ibss_join(IEEE80211_DEV_TO_SUB_IF(dev), params); } static int ieee80211_leave_ibss(struct wiphy *wiphy, struct net_device *dev) { return ieee80211_ibss_leave(IEEE80211_DEV_TO_SUB_IF(dev)); } static int ieee80211_join_ocb(struct wiphy *wiphy, struct net_device *dev, struct ocb_setup *setup) { return ieee80211_ocb_join(IEEE80211_DEV_TO_SUB_IF(dev), setup); } static int ieee80211_leave_ocb(struct wiphy *wiphy, struct net_device *dev) { return ieee80211_ocb_leave(IEEE80211_DEV_TO_SUB_IF(dev)); } static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev, int rate[NUM_NL80211_BANDS]) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); memcpy(sdata->vif.bss_conf.mcast_rate, rate, sizeof(int) * NUM_NL80211_BANDS); if (ieee80211_sdata_running(sdata)) ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_MCAST_RATE); return 0; } static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) { struct ieee80211_local *local = wiphy_priv(wiphy); int err; if (changed & WIPHY_PARAM_FRAG_THRESHOLD) { ieee80211_check_fast_xmit_all(local); err = drv_set_frag_threshold(local, wiphy->frag_threshold); if (err) { ieee80211_check_fast_xmit_all(local); return err; } } if ((changed & WIPHY_PARAM_COVERAGE_CLASS) || (changed & WIPHY_PARAM_DYN_ACK)) { s16 coverage_class; coverage_class = changed & WIPHY_PARAM_COVERAGE_CLASS ? wiphy->coverage_class : -1; err = drv_set_coverage_class(local, coverage_class); if (err) return err; } if (changed & WIPHY_PARAM_RTS_THRESHOLD) { err = drv_set_rts_threshold(local, wiphy->rts_threshold); if (err) return err; } if (changed & WIPHY_PARAM_RETRY_SHORT) { if (wiphy->retry_short > IEEE80211_MAX_TX_RETRY) return -EINVAL; local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; } if (changed & WIPHY_PARAM_RETRY_LONG) { if (wiphy->retry_long > IEEE80211_MAX_TX_RETRY) return -EINVAL; local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; } if (changed & (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG)) ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS); if (changed & (WIPHY_PARAM_TXQ_LIMIT | WIPHY_PARAM_TXQ_MEMORY_LIMIT | WIPHY_PARAM_TXQ_QUANTUM)) ieee80211_txq_set_params(local); return 0; } static int ieee80211_set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, enum nl80211_tx_power_setting type, int mbm) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata; enum nl80211_tx_power_setting txp_type = type; bool update_txp_type = false; bool has_monitor = false; int user_power_level; int old_power = local->user_power_level; lockdep_assert_wiphy(local->hw.wiphy); switch (type) { case NL80211_TX_POWER_AUTOMATIC: user_power_level = IEEE80211_UNSET_POWER_LEVEL; txp_type = NL80211_TX_POWER_LIMITED; break; case NL80211_TX_POWER_LIMITED: case NL80211_TX_POWER_FIXED: if (mbm < 0 || (mbm % 100)) return -EOPNOTSUPP; user_power_level = MBM_TO_DBM(mbm); break; default: return -EINVAL; } if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return -EOPNOTSUPP; sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (!sdata) return -EOPNOTSUPP; } for (int link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link = wiphy_dereference(wiphy, sdata->link[link_id]); if (!link) continue; link->user_power_level = user_power_level; if (txp_type != link->conf->txpower_type) { update_txp_type = true; link->conf->txpower_type = txp_type; } ieee80211_recalc_txpower(link, update_txp_type); } return 0; } local->user_power_level = user_power_level; list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { has_monitor = true; continue; } for (int link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link = wiphy_dereference(wiphy, sdata->link[link_id]); if (!link) continue; link->user_power_level = local->user_power_level; if (txp_type != link->conf->txpower_type) update_txp_type = true; link->conf->txpower_type = txp_type; } } list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; for (int link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link = wiphy_dereference(wiphy, sdata->link[link_id]); if (!link) continue; ieee80211_recalc_txpower(link, update_txp_type); } } if (has_monitor) { sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { sdata->deflink.user_power_level = local->user_power_level; if (txp_type != sdata->vif.bss_conf.txpower_type) update_txp_type = true; sdata->vif.bss_conf.txpower_type = txp_type; ieee80211_recalc_txpower(&sdata->deflink, update_txp_type); } } if (local->emulate_chanctx && (old_power != local->user_power_level)) ieee80211_hw_conf_chan(local); return 0; } static int ieee80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id, int *dbm) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_link_data *link_data; if (local->ops->get_txpower && (sdata->flags & IEEE80211_SDATA_IN_DRIVER)) return drv_get_txpower(local, sdata, link_id, dbm); if (local->emulate_chanctx) { *dbm = local->hw.conf.power_level; } else { link_data = wiphy_dereference(wiphy, sdata->link[link_id]); if (link_data) *dbm = link_data->conf->txpower; else return -ENOLINK; } /* INT_MIN indicates no power level was set yet */ if (*dbm == INT_MIN) return -EINVAL; return 0; } static void ieee80211_rfkill_poll(struct wiphy *wiphy) { struct ieee80211_local *local = wiphy_priv(wiphy); drv_rfkill_poll(local); } #ifdef CONFIG_NL80211_TESTMODE static int ieee80211_testmode_cmd(struct wiphy *wiphy, struct wireless_dev *wdev, void *data, int len) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_vif *vif = NULL; if (!local->ops->testmode_cmd) return -EOPNOTSUPP; if (wdev) { struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (sdata->flags & IEEE80211_SDATA_IN_DRIVER) vif = &sdata->vif; } return local->ops->testmode_cmd(&local->hw, vif, data, len); } static int ieee80211_testmode_dump(struct wiphy *wiphy, struct sk_buff *skb, struct netlink_callback *cb, void *data, int len) { struct ieee80211_local *local = wiphy_priv(wiphy); if (!local->ops->testmode_dump) return -EOPNOTSUPP; return local->ops->testmode_dump(&local->hw, skb, cb, data, len); } #endif int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, enum ieee80211_smps_mode smps_mode) { const u8 *ap; enum ieee80211_smps_mode old_req; int err; struct sta_info *sta; bool tdls_peer_found = false; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION)) return -EINVAL; if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) return 0; old_req = link->u.mgd.req_smps; link->u.mgd.req_smps = smps_mode; /* The driver indicated that EML is enabled for the interface, which * implies that SMPS flows towards the AP should be stopped. */ if (sdata->vif.driver_flags & IEEE80211_VIF_EML_ACTIVE) return 0; if (old_req == smps_mode && smps_mode != IEEE80211_SMPS_AUTOMATIC) return 0; /* * If not associated, or current association is not an HT * association, there's no need to do anything, just store * the new value until we associate. */ if (!sdata->u.mgd.associated || link->conf->chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT) return 0; ap = sdata->vif.cfg.ap_addr; rcu_read_lock(); list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) continue; tdls_peer_found = true; break; } rcu_read_unlock(); if (smps_mode == IEEE80211_SMPS_AUTOMATIC) { if (tdls_peer_found || !sdata->u.mgd.powersave) smps_mode = IEEE80211_SMPS_OFF; else smps_mode = IEEE80211_SMPS_DYNAMIC; } /* send SM PS frame to AP */ err = ieee80211_send_smps_action(sdata, smps_mode, ap, ap, ieee80211_vif_is_mld(&sdata->vif) ? link->link_id : -1); if (err) link->u.mgd.req_smps = old_req; else if (smps_mode != IEEE80211_SMPS_OFF && tdls_peer_found) ieee80211_teardown_tdls_peers(link); return err; } static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, bool enabled, int timeout) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); unsigned int link_id; if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) return -EOPNOTSUPP; if (enabled == sdata->u.mgd.powersave && timeout == local->dynamic_ps_forced_timeout) return 0; sdata->u.mgd.powersave = enabled; local->dynamic_ps_forced_timeout = timeout; /* no change, but if automatic follow powersave */ for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; __ieee80211_request_smps_mgd(sdata, link, link->u.mgd.req_smps); } if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); ieee80211_check_fast_rx_iface(sdata); return 0; } static void ieee80211_set_cqm_rssi_link(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, s32 rssi_thold, u32 rssi_hyst, s32 rssi_low, s32 rssi_high) { struct ieee80211_bss_conf *conf; if (!link || !link->conf) return; conf = link->conf; if (rssi_thold && rssi_hyst && rssi_thold == conf->cqm_rssi_thold && rssi_hyst == conf->cqm_rssi_hyst) return; conf->cqm_rssi_thold = rssi_thold; conf->cqm_rssi_hyst = rssi_hyst; conf->cqm_rssi_low = rssi_low; conf->cqm_rssi_high = rssi_high; link->u.mgd.last_cqm_event_signal = 0; if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) return; if (sdata->u.mgd.associated && (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_CQM); } static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy, struct net_device *dev, s32 rssi_thold, u32 rssi_hyst) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_vif *vif = &sdata->vif; int link_id; if (vif->driver_flags & IEEE80211_VIF_BEACON_FILTER && !(vif->driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) return -EOPNOTSUPP; /* For MLD, handle CQM change on all the active links */ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); ieee80211_set_cqm_rssi_link(sdata, link, rssi_thold, rssi_hyst, 0, 0); } return 0; } static int ieee80211_set_cqm_rssi_range_config(struct wiphy *wiphy, struct net_device *dev, s32 rssi_low, s32 rssi_high) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_vif *vif = &sdata->vif; int link_id; if (vif->driver_flags & IEEE80211_VIF_BEACON_FILTER) return -EOPNOTSUPP; /* For MLD, handle CQM change on all the active links */ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); ieee80211_set_cqm_rssi_link(sdata, link, 0, 0, rssi_low, rssi_high); } return 0; } static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, struct net_device *dev, unsigned int link_id, const u8 *addr, const struct cfg80211_bitrate_mask *mask) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); int i, ret; if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; /* * If active validate the setting and reject it if it doesn't leave * at least one basic rate usable, since we really have to be able * to send something, and if we're an AP we have to be able to do * so at a basic rate so that all clients can receive it. */ if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) && sdata->vif.bss_conf.chanreq.oper.chan) { u32 basic_rates = sdata->vif.bss_conf.basic_rates; enum nl80211_band band; band = sdata->vif.bss_conf.chanreq.oper.chan->band; if (!(mask->control[band].legacy & basic_rates)) return -EINVAL; } if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { ret = drv_set_bitrate_mask(local, sdata, mask); if (ret) return ret; } for (i = 0; i < NUM_NL80211_BANDS; i++) { struct ieee80211_supported_band *sband = wiphy->bands[i]; int j; sdata->rc_rateidx_mask[i] = mask->control[i].legacy; memcpy(sdata->rc_rateidx_mcs_mask[i], mask->control[i].ht_mcs, sizeof(mask->control[i].ht_mcs)); memcpy(sdata->rc_rateidx_vht_mcs_mask[i], mask->control[i].vht_mcs, sizeof(mask->control[i].vht_mcs)); sdata->rc_has_mcs_mask[i] = false; sdata->rc_has_vht_mcs_mask[i] = false; if (!sband) continue; for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) { if (sdata->rc_rateidx_mcs_mask[i][j] != 0xff) { sdata->rc_has_mcs_mask[i] = true; break; } } for (j = 0; j < NL80211_VHT_NSS_MAX; j++) { if (sdata->rc_rateidx_vht_mcs_mask[i][j] != 0xffff) { sdata->rc_has_vht_mcs_mask[i] = true; break; } } } return 0; } static int ieee80211_start_radar_detection(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef, u32 cac_time_ms, int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_chan_req chanreq = { .oper = *chandef }; struct ieee80211_local *local = sdata->local; struct ieee80211_link_data *link_data; int err; lockdep_assert_wiphy(local->hw.wiphy); if (!list_empty(&local->roc_list) || local->scanning) return -EBUSY; link_data = sdata_dereference(sdata->link[link_id], sdata); if (!link_data) return -ENOLINK; /* whatever, but channel contexts should not complain about that one */ link_data->smps_mode = IEEE80211_SMPS_OFF; link_data->needed_rx_chains = local->rx_chains; err = ieee80211_link_use_channel(link_data, &chanreq, IEEE80211_CHANCTX_SHARED); if (err) return err; wiphy_delayed_work_queue(wiphy, &link_data->dfs_cac_timer_work, msecs_to_jiffies(cac_time_ms)); return 0; } static void ieee80211_end_cac(struct wiphy *wiphy, struct net_device *dev, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_link_data *link_data; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { link_data = sdata_dereference(sdata->link[link_id], sdata); if (!link_data) continue; wiphy_delayed_work_cancel(wiphy, &link_data->dfs_cac_timer_work); if (sdata->wdev.links[link_id].cac_started) { ieee80211_link_release_channel(link_data); sdata->wdev.links[link_id].cac_started = false; } } } static struct cfg80211_beacon_data * cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) { struct cfg80211_beacon_data *new_beacon; u8 *pos; int len; len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len + beacon->proberesp_ies_len + beacon->assocresp_ies_len + beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len; if (beacon->mbssid_ies) len += ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies, beacon->rnr_ies, beacon->mbssid_ies->cnt); new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL); if (!new_beacon) return NULL; if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) { new_beacon->mbssid_ies = kzalloc(struct_size(new_beacon->mbssid_ies, elem, beacon->mbssid_ies->cnt), GFP_KERNEL); if (!new_beacon->mbssid_ies) { kfree(new_beacon); return NULL; } if (beacon->rnr_ies && beacon->rnr_ies->cnt) { new_beacon->rnr_ies = kzalloc(struct_size(new_beacon->rnr_ies, elem, beacon->rnr_ies->cnt), GFP_KERNEL); if (!new_beacon->rnr_ies) { kfree(new_beacon->mbssid_ies); kfree(new_beacon); return NULL; } } } pos = (u8 *)(new_beacon + 1); if (beacon->head_len) { new_beacon->head_len = beacon->head_len; new_beacon->head = pos; memcpy(pos, beacon->head, beacon->head_len); pos += beacon->head_len; } if (beacon->tail_len) { new_beacon->tail_len = beacon->tail_len; new_beacon->tail = pos; memcpy(pos, beacon->tail, beacon->tail_len); pos += beacon->tail_len; } if (beacon->beacon_ies_len) { new_beacon->beacon_ies_len = beacon->beacon_ies_len; new_beacon->beacon_ies = pos; memcpy(pos, beacon->beacon_ies, beacon->beacon_ies_len); pos += beacon->beacon_ies_len; } if (beacon->proberesp_ies_len) { new_beacon->proberesp_ies_len = beacon->proberesp_ies_len; new_beacon->proberesp_ies = pos; memcpy(pos, beacon->proberesp_ies, beacon->proberesp_ies_len); pos += beacon->proberesp_ies_len; } if (beacon->assocresp_ies_len) { new_beacon->assocresp_ies_len = beacon->assocresp_ies_len; new_beacon->assocresp_ies = pos; memcpy(pos, beacon->assocresp_ies, beacon->assocresp_ies_len); pos += beacon->assocresp_ies_len; } if (beacon->probe_resp_len) { new_beacon->probe_resp_len = beacon->probe_resp_len; new_beacon->probe_resp = pos; memcpy(pos, beacon->probe_resp, beacon->probe_resp_len); pos += beacon->probe_resp_len; } if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) { pos += ieee80211_copy_mbssid_beacon(pos, new_beacon->mbssid_ies, beacon->mbssid_ies); if (beacon->rnr_ies && beacon->rnr_ies->cnt) pos += ieee80211_copy_rnr_beacon(pos, new_beacon->rnr_ies, beacon->rnr_ies); } /* might copy -1, meaning no changes requested */ new_beacon->ftm_responder = beacon->ftm_responder; if (beacon->lci) { new_beacon->lci_len = beacon->lci_len; new_beacon->lci = pos; memcpy(pos, beacon->lci, beacon->lci_len); pos += beacon->lci_len; } if (beacon->civicloc) { new_beacon->civicloc_len = beacon->civicloc_len; new_beacon->civicloc = pos; memcpy(pos, beacon->civicloc, beacon->civicloc_len); pos += beacon->civicloc_len; } return new_beacon; } void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; struct ieee80211_bss_conf *tx_bss_conf; struct ieee80211_link_data *link_data; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return; rcu_read_lock(); link_data = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link_data)) { rcu_read_unlock(); return; } tx_bss_conf = rcu_dereference(link_data->conf->tx_bss_conf); if (tx_bss_conf == link_data->conf) { /* Trigger ieee80211_csa_finish() on the non-transmitting * interfaces when channel switch is received on * transmitting interface */ struct ieee80211_link_data *iter; for_each_sdata_link(local, iter) { if (iter->sdata == sdata || rcu_access_pointer(iter->conf->tx_bss_conf) != tx_bss_conf) continue; wiphy_work_queue(iter->sdata->local->hw.wiphy, &iter->csa.finalize_work); } } wiphy_work_queue(local->hw.wiphy, &link_data->csa.finalize_work); rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_csa_finish); void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; sdata_info(sdata, "channel switch failed, disconnecting\n"); wiphy_work_queue(local->hw.wiphy, &ifmgd->csa_connection_drop_work); } EXPORT_SYMBOL(ieee80211_channel_switch_disconnect); static int ieee80211_set_after_csa_beacon(struct ieee80211_link_data *link_data, u64 *changed) { struct ieee80211_sub_if_data *sdata = link_data->sdata; int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: if (!link_data->u.ap.next_beacon) return -EINVAL; err = ieee80211_assign_beacon(sdata, link_data, link_data->u.ap.next_beacon, NULL, NULL, changed); ieee80211_free_next_beacon(link_data); if (err < 0) return err; break; case NL80211_IFTYPE_ADHOC: err = ieee80211_ibss_finish_csa(sdata, changed); if (err < 0) return err; break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: err = ieee80211_mesh_finish_csa(sdata, changed); if (err < 0) return err; break; #endif default: WARN_ON(1); return -EINVAL; } return 0; } static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data) { struct ieee80211_sub_if_data *sdata = link_data->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_bss_conf *link_conf = link_data->conf; u64 changed = 0; int err; lockdep_assert_wiphy(local->hw.wiphy); /* * using reservation isn't immediate as it may be deferred until later * with multi-vif. once reservation is complete it will re-schedule the * work with no reserved_chanctx so verify chandef to check if it * completed successfully */ if (link_data->reserved_chanctx) { /* * with multi-vif csa driver may call ieee80211_csa_finish() * many times while waiting for other interfaces to use their * reservations */ if (link_data->reserved_ready) return 0; return ieee80211_link_use_reserved_context(link_data); } if (!cfg80211_chandef_identical(&link_conf->chanreq.oper, &link_data->csa.chanreq.oper)) return -EINVAL; link_conf->csa_active = false; err = ieee80211_set_after_csa_beacon(link_data, &changed); if (err) return err; ieee80211_link_info_change_notify(sdata, link_data, changed); ieee80211_vif_unblock_queues_csa(sdata); err = drv_post_channel_switch(link_data); if (err) return err; cfg80211_ch_switch_notify(sdata->dev, &link_data->csa.chanreq.oper, link_data->link_id); return 0; } static void ieee80211_csa_finalize(struct ieee80211_link_data *link_data) { struct ieee80211_sub_if_data *sdata = link_data->sdata; if (__ieee80211_csa_finalize(link_data)) { sdata_info(sdata, "failed to finalize CSA on link %d, disconnecting\n", link_data->link_id); cfg80211_stop_iface(sdata->local->hw.wiphy, &sdata->wdev, GFP_KERNEL); } } void ieee80211_csa_finalize_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, csa.finalize_work); struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); /* AP might have been stopped while waiting for the lock. */ if (!link->conf->csa_active) return; if (!ieee80211_sdata_running(sdata)) return; ieee80211_csa_finalize(link); } static int ieee80211_set_csa_beacon(struct ieee80211_link_data *link_data, struct cfg80211_csa_settings *params, u64 *changed) { struct ieee80211_sub_if_data *sdata = link_data->sdata; struct ieee80211_csa_settings csa = {}; int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: link_data->u.ap.next_beacon = cfg80211_beacon_dup(&params->beacon_after); if (!link_data->u.ap.next_beacon) return -ENOMEM; /* * With a count of 0, we don't have to wait for any * TBTT before switching, so complete the CSA * immediately. In theory, with a count == 1 we * should delay the switch until just before the next * TBTT, but that would complicate things so we switch * immediately too. If we would delay the switch * until the next TBTT, we would have to set the probe * response here. * * TODO: A channel switch with count <= 1 without * sending a CSA action frame is kind of useless, * because the clients won't know we're changing * channels. The action frame must be implemented * either here or in the userspace. */ if (params->count <= 1) break; if ((params->n_counter_offsets_beacon > IEEE80211_MAX_CNTDWN_COUNTERS_NUM) || (params->n_counter_offsets_presp > IEEE80211_MAX_CNTDWN_COUNTERS_NUM)) { ieee80211_free_next_beacon(link_data); return -EINVAL; } csa.counter_offsets_beacon = params->counter_offsets_beacon; csa.counter_offsets_presp = params->counter_offsets_presp; csa.n_counter_offsets_beacon = params->n_counter_offsets_beacon; csa.n_counter_offsets_presp = params->n_counter_offsets_presp; csa.count = params->count; err = ieee80211_assign_beacon(sdata, link_data, &params->beacon_csa, &csa, NULL, changed); if (err < 0) { ieee80211_free_next_beacon(link_data); return err; } break; case NL80211_IFTYPE_ADHOC: if (!sdata->vif.cfg.ibss_joined) return -EINVAL; if (params->chandef.width != sdata->u.ibss.chandef.width) return -EINVAL; switch (params->chandef.width) { case NL80211_CHAN_WIDTH_40: if (cfg80211_get_chandef_type(&params->chandef) != cfg80211_get_chandef_type(&sdata->u.ibss.chandef)) return -EINVAL; break; case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: break; default: return -EINVAL; } /* changes into another band are not supported */ if (sdata->u.ibss.chandef.chan->band != params->chandef.chan->band) return -EINVAL; /* see comments in the NL80211_IFTYPE_AP block */ if (params->count > 1) { err = ieee80211_ibss_csa_beacon(sdata, params, changed); if (err < 0) return err; } ieee80211_send_action_csa(sdata, params); break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; /* changes into another band are not supported */ if (sdata->vif.bss_conf.chanreq.oper.chan->band != params->chandef.chan->band) return -EINVAL; if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_NONE) { ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_INIT; if (!ifmsh->pre_value) ifmsh->pre_value = 1; else ifmsh->pre_value++; } /* see comments in the NL80211_IFTYPE_AP block */ if (params->count > 1) { err = ieee80211_mesh_csa_beacon(sdata, params, changed); if (err < 0) { ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE; return err; } } if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_INIT) ieee80211_send_action_csa(sdata, params); break; } #endif default: return -EOPNOTSUPP; } return 0; } static void ieee80211_color_change_abort(struct ieee80211_link_data *link) { link->conf->color_change_active = false; ieee80211_free_next_beacon(link); cfg80211_color_change_aborted_notify(link->sdata->dev, link->link_id); } static int __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_chan_req chanreq = { .oper = params->chandef }; struct ieee80211_local *local = sdata->local; struct ieee80211_channel_switch ch_switch = { .link_id = params->link_id, }; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *chanctx; struct ieee80211_bss_conf *link_conf; struct ieee80211_link_data *link_data; u64 changed = 0; u8 link_id = params->link_id; int err; lockdep_assert_wiphy(local->hw.wiphy); if (!list_empty(&local->roc_list) || local->scanning) return -EBUSY; if (sdata->wdev.links[link_id].cac_started) return -EBUSY; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return -EINVAL; link_data = wiphy_dereference(wiphy, sdata->link[link_id]); if (!link_data) return -ENOLINK; link_conf = link_data->conf; if (chanreq.oper.punctured && !link_conf->eht_support) return -EINVAL; /* don't allow another channel switch if one is already active. */ if (link_conf->csa_active) return -EBUSY; conf = wiphy_dereference(wiphy, link_conf->chanctx_conf); if (!conf) { err = -EBUSY; goto out; } if (params->chandef.chan->freq_offset) { /* this may work, but is untested */ err = -EOPNOTSUPP; goto out; } chanctx = container_of(conf, struct ieee80211_chanctx, conf); ch_switch.timestamp = 0; ch_switch.device_timestamp = 0; ch_switch.block_tx = params->block_tx; ch_switch.chandef = chanreq.oper; ch_switch.count = params->count; err = drv_pre_channel_switch(sdata, &ch_switch); if (err) goto out; err = ieee80211_link_reserve_chanctx(link_data, &chanreq, chanctx->mode, params->radar_required); if (err) goto out; /* if reservation is invalid then this will fail */ err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0, -1); if (err) { ieee80211_link_unreserve_chanctx(link_data); goto out; } /* if there is a color change in progress, abort it */ if (link_conf->color_change_active) ieee80211_color_change_abort(link_data); err = ieee80211_set_csa_beacon(link_data, params, &changed); if (err) { ieee80211_link_unreserve_chanctx(link_data); goto out; } link_data->csa.chanreq = chanreq; link_conf->csa_active = true; if (params->block_tx) ieee80211_vif_block_queues_csa(sdata); cfg80211_ch_switch_started_notify(sdata->dev, &link_data->csa.chanreq.oper, link_id, params->count, params->block_tx); if (changed) { ieee80211_link_info_change_notify(sdata, link_data, changed); drv_channel_switch_beacon(sdata, &link_data->csa.chanreq.oper); } else { /* if the beacon didn't change, we can finalize immediately */ ieee80211_csa_finalize(link_data); } out: return err; } int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); return __ieee80211_channel_switch(wiphy, dev, params); } u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local) { lockdep_assert_wiphy(local->hw.wiphy); local->roc_cookie_counter++; /* wow, you wrapped 64 bits ... more likely a bug */ if (WARN_ON(local->roc_cookie_counter == 0)) local->roc_cookie_counter++; return local->roc_cookie_counter; } int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, u64 *cookie, gfp_t gfp) { unsigned long spin_flags; struct sk_buff *ack_skb; int id; ack_skb = skb_copy(skb, gfp); if (!ack_skb) return -ENOMEM; spin_lock_irqsave(&local->ack_status_lock, spin_flags); id = idr_alloc(&local->ack_status_frames, ack_skb, 1, 0x2000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); if (id < 0) { kfree_skb(ack_skb); return -ENOMEM; } IEEE80211_SKB_CB(skb)->status_data_idr = 1; IEEE80211_SKB_CB(skb)->status_data = id; *cookie = ieee80211_mgmt_tx_cookie(local); IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; return 0; } static void ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, struct wireless_dev *wdev, struct mgmt_frame_regs *upd) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); u32 preq_mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4); u32 action_mask = BIT(IEEE80211_STYPE_ACTION >> 4); bool global_change, intf_change; global_change = (local->probe_req_reg != !!(upd->global_stypes & preq_mask)) || (local->rx_mcast_action_reg != !!(upd->global_mcast_stypes & action_mask)); local->probe_req_reg = upd->global_stypes & preq_mask; local->rx_mcast_action_reg = upd->global_mcast_stypes & action_mask; intf_change = (sdata->vif.probe_req_reg != !!(upd->interface_stypes & preq_mask)) || (sdata->vif.rx_mcast_action_reg != !!(upd->interface_mcast_stypes & action_mask)); sdata->vif.probe_req_reg = upd->interface_stypes & preq_mask; sdata->vif.rx_mcast_action_reg = upd->interface_mcast_stypes & action_mask; if (!local->open_count) return; if (intf_change && ieee80211_sdata_running(sdata)) drv_config_iface_filter(local, sdata, sdata->vif.probe_req_reg ? FIF_PROBE_REQ : 0, FIF_PROBE_REQ); if (global_change) ieee80211_configure_filter(local); } static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); int ret; if (local->started) return -EOPNOTSUPP; ret = drv_set_antenna(local, tx_ant, rx_ant); if (ret) return ret; local->rx_chains = hweight8(rx_ant); return 0; } static int ieee80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); return drv_get_antenna(local, tx_ant, rx_ant); } static int ieee80211_set_rekey_data(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_gtk_rekey_data *data) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (!local->ops->set_rekey_data) return -EOPNOTSUPP; drv_set_rekey_data(local, sdata, data); return 0; } static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; bool qos; struct ieee80211_tx_info *info; struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; enum nl80211_band band; int ret; /* the lock is needed to assign the cookie later */ lockdep_assert_wiphy(local->hw.wiphy); rcu_read_lock(); sta = sta_info_get_bss(sdata, peer); if (!sta) { ret = -ENOLINK; goto unlock; } qos = sta->sta.wme; chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { ret = -EINVAL; goto unlock; } band = chanctx_conf->def.chan->band; if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_FCTL_FROMDS); } else { size -= 2; fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_FROMDS); } skb = dev_alloc_skb(local->hw.extra_tx_headroom + size); if (!skb) { ret = -ENOMEM; goto unlock; } skb->dev = dev; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put(skb, size); nullfunc->frame_control = fc; nullfunc->duration_id = 0; memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN); nullfunc->seq_ctrl = 0; info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_NL80211_FRAME_TX; info->band = band; skb_set_queue_mapping(skb, IEEE80211_AC_VO); skb->priority = 7; if (qos) nullfunc->qos_ctrl = cpu_to_le16(7); ret = ieee80211_attach_ack_skb(local, skb, cookie, GFP_ATOMIC); if (ret) { kfree_skb(skb); goto unlock; } local_bh_disable(); ieee80211_xmit(sdata, sta, skb); local_bh_enable(); ret = 0; unlock: rcu_read_unlock(); return ret; } static int ieee80211_cfg_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id, struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_link_data *link; int ret = -ENODATA; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (!link) { ret = -ENOLINK; goto out; } chanctx_conf = rcu_dereference(link->conf->chanctx_conf); if (chanctx_conf) { *chandef = link->conf->chanreq.oper; ret = 0; } else if (local->open_count > 0 && local->open_count == local->virt_monitors && sdata->vif.type == NL80211_IFTYPE_MONITOR) { *chandef = local->monitor_chanreq.oper; ret = 0; } out: rcu_read_unlock(); return ret; } #ifdef CONFIG_PM static void ieee80211_set_wakeup(struct wiphy *wiphy, bool enabled) { drv_set_wakeup(wiphy_priv(wiphy), enabled); } #endif static int ieee80211_set_qos_map(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_qos_map *qos_map) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct mac80211_qos_map *new_qos_map, *old_qos_map; if (qos_map) { new_qos_map = kzalloc(sizeof(*new_qos_map), GFP_KERNEL); if (!new_qos_map) return -ENOMEM; memcpy(&new_qos_map->qos_map, qos_map, sizeof(*qos_map)); } else { /* A NULL qos_map was passed to disable QoS mapping */ new_qos_map = NULL; } old_qos_map = sdata_dereference(sdata->qos_map, sdata); rcu_assign_pointer(sdata->qos_map, new_qos_map); if (old_qos_map) kfree_rcu(old_qos_map, rcu_head); return 0; } static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy, struct net_device *dev, unsigned int link_id, struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link; struct ieee80211_chan_req chanreq = { .oper = *chandef }; int ret; u64 changed = 0; link = sdata_dereference(sdata->link[link_id], sdata); ret = ieee80211_link_change_chanreq(link, &chanreq, &changed); if (ret == 0) ieee80211_link_info_change_notify(sdata, link, changed); return ret; } static int ieee80211_add_tx_ts(struct wiphy *wiphy, struct net_device *dev, u8 tsid, const u8 *peer, u8 up, u16 admitted_time) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; int ac = ieee802_1d_to_ac[up]; if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!(sdata->wmm_acm & BIT(up))) return -EINVAL; if (ifmgd->tx_tspec[ac].admitted_time) return -EBUSY; if (admitted_time) { ifmgd->tx_tspec[ac].admitted_time = 32 * admitted_time; ifmgd->tx_tspec[ac].tsid = tsid; ifmgd->tx_tspec[ac].up = up; } return 0; } static int ieee80211_del_tx_ts(struct wiphy *wiphy, struct net_device *dev, u8 tsid, const u8 *peer) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = wiphy_priv(wiphy); int ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { struct ieee80211_sta_tx_tspec *tx_tspec = &ifmgd->tx_tspec[ac]; /* skip unused entries */ if (!tx_tspec->admitted_time) continue; if (tx_tspec->tsid != tsid) continue; /* due to this new packets will be reassigned to non-ACM ACs */ tx_tspec->up = -1; /* Make sure that all packets have been sent to avoid to * restore the QoS params on packets that are still on the * queues. */ synchronize_net(); ieee80211_flush_queues(local, sdata, false); /* restore the normal QoS parameters * (unconditionally to avoid races) */ tx_tspec->action = TX_TSPEC_ACTION_STOP_DOWNGRADE; tx_tspec->downgraded = false; ieee80211_sta_handle_tspec_ac_params(sdata); /* finally clear all the data */ memset(tx_tspec, 0, sizeof(*tx_tspec)); return 0; } return -ENOENT; } void ieee80211_nan_func_terminated(struct ieee80211_vif *vif, u8 inst_id, enum nl80211_nan_func_term_reason reason, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct cfg80211_nan_func *func; u64 cookie; if (WARN_ON(vif->type != NL80211_IFTYPE_NAN)) return; spin_lock_bh(&sdata->u.nan.func_lock); func = idr_find(&sdata->u.nan.function_inst_ids, inst_id); if (WARN_ON(!func)) { spin_unlock_bh(&sdata->u.nan.func_lock); return; } cookie = func->cookie; idr_remove(&sdata->u.nan.function_inst_ids, inst_id); spin_unlock_bh(&sdata->u.nan.func_lock); cfg80211_free_nan_func(func); cfg80211_nan_func_terminated(ieee80211_vif_to_wdev(vif), inst_id, reason, cookie, gfp); } EXPORT_SYMBOL(ieee80211_nan_func_terminated); void ieee80211_nan_func_match(struct ieee80211_vif *vif, struct cfg80211_nan_match_params *match, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct cfg80211_nan_func *func; if (WARN_ON(vif->type != NL80211_IFTYPE_NAN)) return; spin_lock_bh(&sdata->u.nan.func_lock); func = idr_find(&sdata->u.nan.function_inst_ids, match->inst_id); if (WARN_ON(!func)) { spin_unlock_bh(&sdata->u.nan.func_lock); return; } match->cookie = func->cookie; spin_unlock_bh(&sdata->u.nan.func_lock); cfg80211_nan_match(ieee80211_vif_to_wdev(vif), match, gfp); } EXPORT_SYMBOL(ieee80211_nan_func_match); static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy, struct net_device *dev, const bool enabled) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); sdata->u.ap.multicast_to_unicast = enabled; return 0; } void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats, struct txq_info *txqi) { if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_BYTES))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_BYTES); txqstats->backlog_bytes = txqi->tin.backlog_bytes; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS); txqstats->backlog_packets = txqi->tin.backlog_packets; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_FLOWS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_FLOWS); txqstats->flows = txqi->tin.flows; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_DROPS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_DROPS); txqstats->drops = txqi->cstats.drop_count; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_ECN_MARKS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_ECN_MARKS); txqstats->ecn_marks = txqi->cstats.ecn_mark; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_OVERLIMIT))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_OVERLIMIT); txqstats->overlimit = txqi->tin.overlimit; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_COLLISIONS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_COLLISIONS); txqstats->collisions = txqi->tin.collisions; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_BYTES))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_BYTES); txqstats->tx_bytes = txqi->tin.tx_bytes; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_PACKETS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_PACKETS); txqstats->tx_packets = txqi->tin.tx_packets; } } static int ieee80211_get_txq_stats(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_txq_stats *txqstats) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata; int ret = 0; spin_lock_bh(&local->fq.lock); rcu_read_lock(); if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (!sdata->vif.txq) { ret = 1; goto out; } ieee80211_fill_txq_stats(txqstats, to_txq_info(sdata->vif.txq)); } else { /* phy stats */ txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS) | BIT(NL80211_TXQ_STATS_BACKLOG_BYTES) | BIT(NL80211_TXQ_STATS_OVERLIMIT) | BIT(NL80211_TXQ_STATS_OVERMEMORY) | BIT(NL80211_TXQ_STATS_COLLISIONS) | BIT(NL80211_TXQ_STATS_MAX_FLOWS); txqstats->backlog_packets = local->fq.backlog; txqstats->backlog_bytes = local->fq.memory_usage; txqstats->overlimit = local->fq.overlimit; txqstats->overmemory = local->fq.overmemory; txqstats->collisions = local->fq.collisions; txqstats->max_flows = local->fq.flows_cnt; } out: rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return ret; } static int ieee80211_get_ftm_responder_stats(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ftm_responder_stats *ftm_stats) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); return drv_get_ftm_responder_stats(local, sdata, ftm_stats); } static int ieee80211_start_pmsr(struct wiphy *wiphy, struct wireless_dev *dev, struct cfg80211_pmsr_request *request) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev); return drv_start_pmsr(local, sdata, request); } static void ieee80211_abort_pmsr(struct wiphy *wiphy, struct wireless_dev *dev, struct cfg80211_pmsr_request *request) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev); return drv_abort_pmsr(local, sdata, request); } static int ieee80211_set_tid_config(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_tid_config *tid_conf) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!sdata->local->ops->set_tid_config) return -EOPNOTSUPP; if (!tid_conf->peer) return drv_set_tid_config(sdata->local, sdata, NULL, tid_conf); sta = sta_info_get_bss(sdata, tid_conf->peer); if (!sta) return -ENOENT; return drv_set_tid_config(sdata->local, sdata, &sta->sta, tid_conf); } static int ieee80211_reset_tid_config(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, u8 tids) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!sdata->local->ops->reset_tid_config) return -EOPNOTSUPP; if (!peer) return drv_reset_tid_config(sdata->local, sdata, NULL, tids); sta = sta_info_get_bss(sdata, peer); if (!sta) return -ENOENT; return drv_reset_tid_config(sdata->local, sdata, &sta->sta, tids); } static int ieee80211_set_sar_specs(struct wiphy *wiphy, struct cfg80211_sar_specs *sar) { struct ieee80211_local *local = wiphy_priv(wiphy); if (!local->ops->set_sar_specs) return -EOPNOTSUPP; return local->ops->set_sar_specs(&local->hw, sar); } static int ieee80211_set_after_color_change_beacon(struct ieee80211_link_data *link, u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: { int ret; if (!link->u.ap.next_beacon) return -EINVAL; ret = ieee80211_assign_beacon(sdata, link, link->u.ap.next_beacon, NULL, NULL, changed); ieee80211_free_next_beacon(link); if (ret < 0) return ret; break; } default: WARN_ON_ONCE(1); return -EINVAL; } return 0; } static int ieee80211_set_color_change_beacon(struct ieee80211_link_data *link, struct cfg80211_color_change_settings *params, u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_color_change_settings color_change = {}; int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: link->u.ap.next_beacon = cfg80211_beacon_dup(&params->beacon_next); if (!link->u.ap.next_beacon) return -ENOMEM; if (params->count <= 1) break; color_change.counter_offset_beacon = params->counter_offset_beacon; color_change.counter_offset_presp = params->counter_offset_presp; color_change.count = params->count; err = ieee80211_assign_beacon(sdata, link, &params->beacon_color_change, NULL, &color_change, changed); if (err < 0) { ieee80211_free_next_beacon(link); return err; } break; default: return -EOPNOTSUPP; } return 0; } static void ieee80211_color_change_bss_config_notify(struct ieee80211_link_data *link, u8 color, int enable, u64 changed) { struct ieee80211_sub_if_data *sdata = link->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); link->conf->he_bss_color.color = color; link->conf->he_bss_color.enabled = enable; changed |= BSS_CHANGED_HE_BSS_COLOR; ieee80211_link_info_change_notify(sdata, link, changed); if (!link->conf->nontransmitted && rcu_access_pointer(link->conf->tx_bss_conf)) { struct ieee80211_link_data *tmp; for_each_sdata_link(sdata->local, tmp) { if (tmp->sdata == sdata || rcu_access_pointer(tmp->conf->tx_bss_conf) != link->conf) continue; tmp->conf->he_bss_color.color = color; tmp->conf->he_bss_color.enabled = enable; ieee80211_link_info_change_notify(tmp->sdata, tmp, BSS_CHANGED_HE_BSS_COLOR); } } } static int ieee80211_color_change_finalize(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; u64 changed = 0; int err; lockdep_assert_wiphy(local->hw.wiphy); link->conf->color_change_active = false; err = ieee80211_set_after_color_change_beacon(link, &changed); if (err) { cfg80211_color_change_aborted_notify(sdata->dev, link->link_id); return err; } ieee80211_color_change_bss_config_notify(link, link->conf->color_change_color, 1, changed); cfg80211_color_change_notify(sdata->dev, link->link_id); return 0; } void ieee80211_color_change_finalize_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, color_change_finalize_work); struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); /* AP might have been stopped while waiting for the lock. */ if (!link_conf->color_change_active) return; if (!ieee80211_sdata_running(sdata)) return; ieee80211_color_change_finalize(link); } void ieee80211_color_collision_detection_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, color_collision_detect_work.work); struct ieee80211_sub_if_data *sdata = link->sdata; cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap, link->link_id); } void ieee80211_color_change_finish(struct ieee80211_vif *vif, u8 link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) { rcu_read_unlock(); return; } wiphy_work_queue(sdata->local->hw.wiphy, &link->color_change_finalize_work); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_color_change_finish); void ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, u64 color_bitmap, u8 link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) { rcu_read_unlock(); return; } if (link->conf->color_change_active || link->conf->csa_active) { rcu_read_unlock(); return; } if (wiphy_delayed_work_pending(sdata->local->hw.wiphy, &link->color_collision_detect_work)) { rcu_read_unlock(); return; } link->color_bitmap = color_bitmap; /* queue the color collision detection event every 500 ms in order to * avoid sending too much netlink messages to userspace. */ wiphy_delayed_work_queue(sdata->local->hw.wiphy, &link->color_collision_detect_work, msecs_to_jiffies(500)); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_obss_color_collision_notify); static int ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_color_change_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_bss_conf *link_conf; struct ieee80211_link_data *link; u8 link_id = params->link_id; u64 changed = 0; int err; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return -EINVAL; link = wiphy_dereference(wiphy, sdata->link[link_id]); if (!link) return -ENOLINK; link_conf = link->conf; if (link_conf->nontransmitted) return -EINVAL; /* don't allow another color change if one is already active or if csa * is active */ if (link_conf->color_change_active || link_conf->csa_active) { err = -EBUSY; goto out; } err = ieee80211_set_color_change_beacon(link, params, &changed); if (err) goto out; link_conf->color_change_active = true; link_conf->color_change_color = params->color; cfg80211_color_change_started_notify(sdata->dev, params->count, link_id); if (changed) ieee80211_color_change_bss_config_notify(link, 0, 0, changed); else /* if the beacon didn't change, we can finalize immediately */ ieee80211_color_change_finalize(link); out: return err; } static int ieee80211_set_radar_background(struct wiphy *wiphy, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = wiphy_priv(wiphy); if (!local->ops->set_radar_background) return -EOPNOTSUPP; return local->ops->set_radar_background(&local->hw, chandef); } static int ieee80211_add_intf_link(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); lockdep_assert_wiphy(sdata->local->hw.wiphy); if (wdev->use_4addr) return -EOPNOTSUPP; return ieee80211_vif_set_links(sdata, wdev->valid_links, 0); } static void ieee80211_del_intf_link(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); u16 new_links = wdev->valid_links & ~BIT(link_id); lockdep_assert_wiphy(sdata->local->hw.wiphy); /* During the link teardown process, certain functions require the * link_id to remain in the valid_links bitmap. Therefore, instead * of removing the link_id from the bitmap, pass a masked value to * simulate as if link_id does not exist anymore. */ ieee80211_vif_set_links(sdata, new_links, 0); } static int ieee80211_add_link_station(struct wiphy *wiphy, struct net_device *dev, struct link_station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; int ret; lockdep_assert_wiphy(local->hw.wiphy); sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; if (!sta->sta.valid_links) return -EINVAL; if (sta->sta.valid_links & BIT(params->link_id)) return -EALREADY; ret = ieee80211_sta_allocate_link(sta, params->link_id); if (ret) return ret; ret = sta_link_apply_parameters(local, sta, STA_LINK_MODE_NEW, params); if (ret) { ieee80211_sta_free_link(sta, params->link_id); return ret; } if (test_sta_flag(sta, WLAN_STA_ASSOC)) { struct link_sta_info *link_sta; link_sta = sdata_dereference(sta->link[params->link_id], sdata); rate_control_rate_init(link_sta); } /* ieee80211_sta_activate_link frees the link upon failure */ return ieee80211_sta_activate_link(sta, params->link_id); } static int ieee80211_mod_link_station(struct wiphy *wiphy, struct net_device *dev, struct link_station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; lockdep_assert_wiphy(local->hw.wiphy); sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; if (!(sta->sta.valid_links & BIT(params->link_id))) return -EINVAL; return sta_link_apply_parameters(local, sta, STA_LINK_MODE_LINK_MODIFY, params); } static int ieee80211_del_link_station(struct wiphy *wiphy, struct net_device *dev, struct link_station_del_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; if (!(sta->sta.valid_links & BIT(params->link_id))) return -EINVAL; /* must not create a STA without links */ if (sta->sta.valid_links == BIT(params->link_id)) return -EINVAL; ieee80211_sta_remove_link(sta, params->link_id); return 0; } static int ieee80211_set_hw_timestamp(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_set_hw_timestamp *hwts) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; if (!local->ops->set_hw_timestamp) return -EOPNOTSUPP; if (!check_sdata_in_driver(sdata)) return -EIO; return local->ops->set_hw_timestamp(&local->hw, &sdata->vif, hwts); } static int ieee80211_set_ttlm(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ttlm_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); lockdep_assert_wiphy(sdata->local->hw.wiphy); return ieee80211_req_neg_ttlm(sdata, params); } static int ieee80211_assoc_ml_reconf(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ml_reconf_req *req) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); lockdep_assert_wiphy(sdata->local->hw.wiphy); return ieee80211_mgd_assoc_ml_reconf(sdata, req); } static int ieee80211_set_epcs(struct wiphy *wiphy, struct net_device *dev, bool enable) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); return ieee80211_mgd_set_epcs(sdata, enable); } const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, .change_virtual_intf = ieee80211_change_iface, .start_p2p_device = ieee80211_start_p2p_device, .stop_p2p_device = ieee80211_stop_p2p_device, .add_key = ieee80211_add_key, .del_key = ieee80211_del_key, .get_key = ieee80211_get_key, .set_default_key = ieee80211_config_default_key, .set_default_mgmt_key = ieee80211_config_default_mgmt_key, .set_default_beacon_key = ieee80211_config_default_beacon_key, .start_ap = ieee80211_start_ap, .change_beacon = ieee80211_change_beacon, .stop_ap = ieee80211_stop_ap, .add_station = ieee80211_add_station, .del_station = ieee80211_del_station, .change_station = ieee80211_change_station, .get_station = ieee80211_get_station, .dump_station = ieee80211_dump_station, .dump_survey = ieee80211_dump_survey, #ifdef CONFIG_MAC80211_MESH .add_mpath = ieee80211_add_mpath, .del_mpath = ieee80211_del_mpath, .change_mpath = ieee80211_change_mpath, .get_mpath = ieee80211_get_mpath, .dump_mpath = ieee80211_dump_mpath, .get_mpp = ieee80211_get_mpp, .dump_mpp = ieee80211_dump_mpp, .update_mesh_config = ieee80211_update_mesh_config, .get_mesh_config = ieee80211_get_mesh_config, .join_mesh = ieee80211_join_mesh, .leave_mesh = ieee80211_leave_mesh, #endif .join_ocb = ieee80211_join_ocb, .leave_ocb = ieee80211_leave_ocb, .change_bss = ieee80211_change_bss, .inform_bss = ieee80211_inform_bss, .set_txq_params = ieee80211_set_txq_params, .set_monitor_channel = ieee80211_set_monitor_channel, .suspend = ieee80211_suspend, .resume = ieee80211_resume, .scan = ieee80211_scan, .abort_scan = ieee80211_abort_scan, .sched_scan_start = ieee80211_sched_scan_start, .sched_scan_stop = ieee80211_sched_scan_stop, .auth = ieee80211_auth, .assoc = ieee80211_assoc, .deauth = ieee80211_deauth, .disassoc = ieee80211_disassoc, .join_ibss = ieee80211_join_ibss, .leave_ibss = ieee80211_leave_ibss, .set_mcast_rate = ieee80211_set_mcast_rate, .set_wiphy_params = ieee80211_set_wiphy_params, .set_tx_power = ieee80211_set_tx_power, .get_tx_power = ieee80211_get_tx_power, .rfkill_poll = ieee80211_rfkill_poll, CFG80211_TESTMODE_CMD(ieee80211_testmode_cmd) CFG80211_TESTMODE_DUMP(ieee80211_testmode_dump) .set_power_mgmt = ieee80211_set_power_mgmt, .set_bitrate_mask = ieee80211_set_bitrate_mask, .remain_on_channel = ieee80211_remain_on_channel, .cancel_remain_on_channel = ieee80211_cancel_remain_on_channel, .mgmt_tx = ieee80211_mgmt_tx, .mgmt_tx_cancel_wait = ieee80211_mgmt_tx_cancel_wait, .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, .set_cqm_rssi_range_config = ieee80211_set_cqm_rssi_range_config, .update_mgmt_frame_registrations = ieee80211_update_mgmt_frame_registrations, .set_antenna = ieee80211_set_antenna, .get_antenna = ieee80211_get_antenna, .set_rekey_data = ieee80211_set_rekey_data, .tdls_oper = ieee80211_tdls_oper, .tdls_mgmt = ieee80211_tdls_mgmt, .tdls_channel_switch = ieee80211_tdls_channel_switch, .tdls_cancel_channel_switch = ieee80211_tdls_cancel_channel_switch, .probe_client = ieee80211_probe_client, .set_noack_map = ieee80211_set_noack_map, #ifdef CONFIG_PM .set_wakeup = ieee80211_set_wakeup, #endif .get_channel = ieee80211_cfg_get_channel, .start_radar_detection = ieee80211_start_radar_detection, .end_cac = ieee80211_end_cac, .channel_switch = ieee80211_channel_switch, .set_qos_map = ieee80211_set_qos_map, .set_ap_chanwidth = ieee80211_set_ap_chanwidth, .add_tx_ts = ieee80211_add_tx_ts, .del_tx_ts = ieee80211_del_tx_ts, .start_nan = ieee80211_start_nan, .stop_nan = ieee80211_stop_nan, .nan_change_conf = ieee80211_nan_change_conf, .add_nan_func = ieee80211_add_nan_func, .del_nan_func = ieee80211_del_nan_func, .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast, .tx_control_port = ieee80211_tx_control_port, .get_txq_stats = ieee80211_get_txq_stats, .get_ftm_responder_stats = ieee80211_get_ftm_responder_stats, .start_pmsr = ieee80211_start_pmsr, .abort_pmsr = ieee80211_abort_pmsr, .probe_mesh_link = ieee80211_probe_mesh_link, .set_tid_config = ieee80211_set_tid_config, .reset_tid_config = ieee80211_reset_tid_config, .set_sar_specs = ieee80211_set_sar_specs, .color_change = ieee80211_color_change, .set_radar_background = ieee80211_set_radar_background, .add_intf_link = ieee80211_add_intf_link, .del_intf_link = ieee80211_del_intf_link, .add_link_station = ieee80211_add_link_station, .mod_link_station = ieee80211_mod_link_station, .del_link_station = ieee80211_del_link_station, .set_hw_timestamp = ieee80211_set_hw_timestamp, .set_ttlm = ieee80211_set_ttlm, .get_radio_mask = ieee80211_get_radio_mask, .assoc_ml_reconf = ieee80211_assoc_ml_reconf, .set_epcs = ieee80211_set_epcs, };
14 14 3 1 14 2 14 1 14 14 14 14 103 83 83 75 8 83 20 19 1 20 20 2 20 6 72 20 70 4 33 1 33 19 91 91 91 11 2 17 43 37 6 6 6 43 27 28 18 2 13 27 5 7 3 3 6 8 8 5 4 8 1 1 7 3 3 37 37 201 216 23 69 29 16 16 42 42 14 14 28 41 42 39 34 6 4 2 38 36 1 4 42 6 6 5 2 5 2 2 2 2 62 6 1 11 44 1 1 1 1 38 37 33 30 31 31 1 1 3 36 38 2 1 39 1 36 1 1 1 37 37 37 45 45 5 39 42 171 4 167 21 150 171 149 19 19 5 11 2 2 1 46 62 53 1 1 1 1 50 36 12 2 2 11 46 23 4 1 1 1 31 7 3 30 32 3 418 30 30 10 170 10 148 158 157 158 158 150 8 56 47 9 50 1 45 4 25 24 21 28 46 3 44 5 47 2 47 2 41 8 38 11 46 3 48 48 43 5 9 36 30 8 9 1 3 2 2 2 4 2 2 252 289 1 6 4 1 1 3 7 3 2 4 7 281 26 412 89 89 86 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 tunneling device * Linux INET6 implementation * * Authors: * Ville Nuorvala <vnuorval@tcs.hut.fi> * Yasuyuki Kozakai <kozakai@linux-ipv6.org> * * Based on: * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c * * RFC 2473 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/etherdevice.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netdev_lock.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); #define IP6_TUNNEL_HASH_SIZE_SHIFT 5 #define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT); } static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); static struct rtnl_link_ops ip6_link_ops __read_mostly; static unsigned int ip6_tnl_net_id __read_mostly; struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; struct ip6_tnl __rcu *collect_md_tun; }; static inline int ip6_tnl_mpls_supported(void) { return IS_ENABLED(CONFIG_MPLS); } #define for_each_ip6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @net: network namespace * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * * Return: * tunnel matching given end-points if found, * else fallback tunnel if its device is up, * else %NULL **/ static struct ip6_tnl * ip6_tnl_lookup(struct net *net, int link, const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); struct ip6_tnl *t, *cand = NULL; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct in6_addr any; for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_equal(remote, &t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else cand = t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_any(&t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } hash = HASH(remote, &any); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(remote, &t->parms.raddr) || !ipv6_addr_any(&t->parms.laddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } if (cand) return cand; t = rcu_dereference(ip6n->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } /** * ip6_tnl_bucket - get head of list matching given tunnel parameters * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: * ip6_tnl_bucket() returns the head of the list matching the * &struct in6_addr entries laddr and raddr in @p. * * Return: head of IPv6 tunnel list **/ static struct ip6_tnl __rcu ** ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } /** * ip6_tnl_link - add tunnel to hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be added **/ static void ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } /** * ip6_tnl_unlink - remove tunnel from hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be removed **/ static void ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, NULL); for (tp = ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); } static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); ip6_tnl_link(ip6n, t); return 0; out: return err; } /** * ip6_tnl_create - create a new tunnel * @net: network namespace * @p: tunnel parameters * * Description: * Create tunnel matching given parameters. * * Return: * created tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err = -E2BIG; if (p->name[0]) { if (!dev_valid_name(p->name)) goto failed; strscpy(name, p->name, IFNAMSIZ); } else { sprintf(name, "ip6tnl%%d"); } err = -ENOMEM; dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!dev) goto failed; dev_net_set(dev, net); t = netdev_priv(dev); t->parms = *p; t->net = dev_net(dev); err = ip6_tnl_create2(dev); if (err < 0) goto failed_free; return t; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } /** * ip6_tnl_locate - find or create tunnel matching given parameters * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: * ip6_tnl_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. * * Return: * matching tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_locate(struct net *net, struct __ip6_tnl_parm *p, int create) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); for (tp = ip6_tnl_bucket(ip6n, p); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && p->link == t->parms.link) { if (create) return ERR_PTR(-EEXIST); return t; } } if (!create) return ERR_PTR(-ENODEV); return ip6_tnl_create(net, p); } /** * ip6_tnl_dev_uninit - tunnel device uninitializer * @dev: the device to be destroyed * * Description: * ip6_tnl_dev_uninit() removes tunnel from its list **/ static void ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); dst_cache_reset(&t->dst_cache); netdev_put(dev, &t->dev_tracker); } /** * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option * @skb: received socket buffer * @raw: the ICMPv6 error message data * * Return: * 0 if none was found, * else index to encapsulation limit **/ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); u8 nexthdr = ipv6h->nexthdr; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; u16 optlen; if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { optlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { optlen = ipv6_authlen(hdr); } else { optlen = ipv6_optlen(hdr); } if (!pskb_may_pull(skb, off + optlen)) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr; if (frag_hdr->frag_off) break; } if (nexthdr == NEXTHDR_DEST) { u16 i = 2; while (1) { struct ipv6_tlv_tnl_enc_lim *tel; /* No more room for encapsulation limit */ if (i + sizeof(*tel) > optlen) break; tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i); /* return index of option if found and valid */ if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && tel->length == 1) return i + off - nhoff; /* else jump to next option */ if (tel->type) i += tel->length + 2; else i++; } } nexthdr = hdr->nexthdr; off += optlen; } return 0; } EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim); /* ip6_tnl_err() should handle errors in the tunnel according to the * specifications in RFC 2473. */ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); u8 rel_type = ICMPV6_DEST_UNREACH; u8 rel_code = ICMPV6_ADDR_UNREACH; __u32 rel_info = 0; struct ip6_tnl *t; int err = -ENOENT; int rel_msg = 0; u8 tproto; __u16 len; /* If the packet doesn't contain the original IPv6 header we are in trouble since we might need the source address for further processing of the error. */ rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr); if (!t) goto out; tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto out; err = 0; switch (*type) { case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } break; case ICMPV6_PARAMPROB: { struct ipv6_tlv_tnl_enc_lim *tel; __u32 teli; teli = 0; if ((*code) == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } } else { net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } break; } case ICMPV6_PKT_TOOBIG: { __u32 mtu; ip6_update_pmtu(skb, net, htonl(*info), 0, 0, sock_net_uid(net, NULL)); mtu = *info - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len); if (len > mtu) { rel_type = ICMPV6_PKT_TOOBIG; rel_code = 0; rel_info = mtu; rel_msg = 1; } break; } case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); break; } *type = rel_type; *code = rel_code; *info = rel_info; *msg = rel_msg; out: rcu_read_unlock(); return err; } static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); const struct iphdr *eiph; struct sk_buff *skb2; int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; struct rtable *rt; struct flowi4 fl4; err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg == 0) return 0; switch (rel_type) { case ICMPV6_DEST_UNREACH: if (rel_code != ICMPV6_ADDR_UNREACH) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; break; case ICMPV6_PKT_TOOBIG: if (rel_code != 0) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; default: return 0; } if (!pskb_may_pull(skb, offset + sizeof(struct iphdr))) return 0; skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); eiph = ip_hdr(skb2); /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, 0, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt)) goto out; skb2->dev = rt->dst.dev; ip_rt_put(rt); /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; } skb_dst_set(skb2, &rt->dst); } else { if (ip_route_input(skb2, eiph->daddr, eiph->saddr, ip4h_dscp(eiph), skb2->dev) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) goto out; } /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { if (rel_info > dst_mtu(skb_dst(skb2))) goto out; skb_dst_update_pmtu_no_confirm(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); out: kfree_skb(skb2); return 0; } static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) { struct rt6_info *rt; struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); /* Try to guess incoming interface */ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; icmpv6_send(skb2, rel_type, rel_code, rel_info); ip6_rt_put(rt); kfree_skb(skb2); } return 0; } static int mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); return err; } static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); return IP6_ECN_decapsulate(ipv6h, skb); } static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); return IP6_ECN_decapsulate(ipv6h, skb); } static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { /* ECN is not supported in AF_MPLS */ return 0; } __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ltype = ipv6_addr_type(laddr); int rtype = ipv6_addr_type(raddr); __u32 flags = 0; if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) { flags = IP6_TNL_F_CAP_PER_PACKET; } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { if (ltype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_XMIT; if (rtype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_RCV; } return flags; } EXPORT_SYMBOL(ip6_tnl_get_cap); /* called with rcu_read_lock() */ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if ((p->flags & IP6_TNL_F_CAP_RCV) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) { struct net_device *ldev = NULL; if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if ((ipv6_addr_is_multicast(laddr) || likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) && ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE)))) ret = 1; } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb), bool log_ecn_err) { const struct ipv6hdr *ipv6h; int nh, err; if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } skb->protocol = tpi->proto; /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; skb_reset_mac_header(skb); } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } /* Get the outer header. */ ipv6h = (struct ipv6hdr *)(skb->head + nh); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); err = dscp_ecn_decapsulate(tunnel, ipv6h, skb); if (unlikely(err)) { if (log_ecn_err) net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n", &ipv6h->saddr, ipv6_get_dsfield(ipv6h)); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tun_dst) skb_dst_set(skb, (struct dst_entry *)tun_dst); gro_cells_receive(&tunnel->gro_cells, skb); return 0; drop: if (tun_dst) dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_err) { int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb); dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate; if (tpi->proto == htons(ETH_P_IP)) dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate; return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); static const struct tnl_ptk_info tpi_v6 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IPV6), }; static const struct tnl_ptk_info tpi_v4 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IP), }; static const struct tnl_ptk_info tpi_mpls = { /* no tunnel info required for mplsip6. */ .proto = htons(ETH_P_MPLS_UC), }; static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, const struct tnl_ptk_info *tpi, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb)) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct metadata_dst *tun_dst = NULL; int ret = -1; rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr); if (t) { u8 tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (t->parms.collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) goto drop; } ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); } rcu_read_unlock(); return ret; drop: rcu_read_unlock(); kfree_skb(skb); return 0; } static int ip4ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4, ip4ip6_dscp_ecn_decapsulate); } static int ip6ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6, ip6ip6_dscp_ecn_decapsulate); } static int mplsip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls, mplsip6_dscp_ecn_decapsulate); } struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; }; static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) { memset(opt, 0, sizeof(struct ipv6_tel_txoption)); opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; opt->dst_opt[3] = 1; opt->dst_opt[4] = encap_limit; opt->dst_opt[5] = IPV6_TLV_PADN; opt->dst_opt[6] = 1; opt->ops.dst1opt = (struct ipv6_opt_hdr *) opt->dst_opt; opt->ops.opt_nflen = 8; } /** * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: IPv6 header from the incoming packet * * Description: * Avoid trivial tunneling loop by checking that tunnel exit-point * doesn't match source of incoming packet. * * Return: * 1 if conflict, * 0 else **/ static inline bool ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if (t->parms.collect_md) return 1; if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { struct net_device *ldev = NULL; rcu_read_lock(); if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n", p->name); else ret = 1; rcu_read_unlock(); } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); /** * ip6_tnl_xmit - encapsulate packet and send * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @dsfield: dscp code for outer header * @fl6: flow of tunneled packet * @encap_limit: encapsulation limit * @pmtu: Path MTU is stored if packet is too big * @proto: next header value * * Description: * Build new header and do some sanity checks on the packet before sending * it. * * Return: * 0 on success * -1 fail * %-EMSGSIZE message too big. return mtu in this case. **/ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ipv6hdr *ipv6h; struct ipv6_tel_txoption opt; struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; __be16 payload_protocol; bool use_cache = false; u8 hop_limit; int err = -1; payload_protocol = skb_protocol(skb, true); if (t->parms.collect_md) { hop_limit = skb_tunnel_info(skb)->key.ttl; goto route_lookup; } else { hop_limit = t->parms.hop_limit; } /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { if (payload_protocol == htons(ETH_P_IPV6)) { struct in6_addr *addr6; struct neighbour *neigh; int addr_type; if (!skb_dst(skb)) goto tx_err_link_failure; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (!neigh) goto tx_err_link_failure; addr6 = (struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) addr6 = &ipv6_hdr(skb)->daddr; memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (payload_protocol == htons(ETH_P_IP)) { const struct rtable *rt = skb_rtable(skb); if (!rt) goto tx_err_link_failure; if (rt->rt_gw_family == AF_INET6) memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr)); } } else if (t->parms.proto != 0 && !(t->parms.flags & (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { /* enable the cache only if neither the outer protocol nor the * routing decision depends on the current inner header value */ use_cache = true; } if (use_cache) dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; if (!dst) { route_lookup: /* add dsfield to flowlabel for route lookup */ fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel); dst = ip6_route_output(net, NULL, fl6); if (dst->error) goto tx_err_link_failure; dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) && ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, &fl6->daddr, 0, &fl6->saddr)) goto tx_err_link_failure; ndst = dst; } tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; } mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; } mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? IPV6_MIN_MTU : IPV4_MIN_MTU); skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; } if (t->err_count > 0) { if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) { t->err_count--; dst_link_failure(skb); } else { t->err_count = 0; } } skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom += LL_RESERVED_SPACE(tdev); if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb; new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) goto tx_err_dst_release; if (skb->sk) skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } if (t->parms.collect_md) { if (t->encap.type != TUNNEL_ENCAP_NONE) goto tx_err_dst_release; } else { if (use_cache && ndst) dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); } skb_dst_set(skb, dst); if (hop_limit == 0) { if (payload_protocol == htons(ETH_P_IP)) hop_limit = ip_hdr(skb)->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) hop_limit = ipv6_hdr(skb)->hop_limit; else hop_limit = ip6_dst_hoplimit(dst); } /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. */ max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; if (max_headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) return err; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, dsfield, ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } EXPORT_SYMBOL(ip6_tnl_xmit); static inline int ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, u8 protocol) { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h; const struct iphdr *iph; int encap_limit = -1; __u16 offset; struct flowi6 fl6; __u8 dsfield, orig_dsfield; __u32 mtu; u8 tproto; int err; tproto = READ_ONCE(t->parms.proto); if (tproto != protocol && tproto != 0) return -1; if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET6)) return -1; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = protocol; fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); break; default: orig_dsfield = dsfield; break; } } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; if (protocol == IPPROTO_IPV6) { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have * reallocated skb->head */ if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (void *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offset + 2); return -1; } encap_limit = tel->encap_limit - 1; } } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = protocol; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); break; default: orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); break; } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; skb_set_inner_ipproto(skb, protocol); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) switch (protocol) { case IPPROTO_IPIP: icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); break; case IPPROTO_IPV6: icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); break; default: break; } return -1; } return 0; } static netdev_tx_t ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; ipproto = IPPROTO_IPV6; break; case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; default: goto tx_err; } ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; int t_hlen; int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ fl6->saddr = p->laddr; fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (rt) { tdev = rt->dst.dev; ip6_rt_put(rt); } if (!tdev && p->link) tdev = __dev_get_by_index(t->net, p->link); if (tdev) { dev->needed_headroom = tdev->hard_header_len + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) mtu -= 8; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); } } } /** * ip6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * * Description: * ip6_tnl_change() updates the tunnel parameters **/ static void ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); } static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); ip6_tnl_unlink(ip6n, t); synchronize_net(); ip6_tnl_change(t, p); ip6_tnl_link(ip6n, t); netdev_state_change(t->dev); } static void ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { /* for default tnl0 device allow to change only the proto */ t->parms.proto = p->proto; netdev_state_change(t->dev); } static void ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->flags = u->flags; p->hop_limit = u->hop_limit; p->encap_limit = u->encap_limit; p->flowinfo = u->flowinfo; p->link = u->link; p->proto = u->proto; memcpy(p->name, u->name, sizeof(u->name)); } static void ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) { u->laddr = p->laddr; u->raddr = p->raddr; u->flags = p->flags; u->hop_limit = p->hop_limit; u->encap_limit = p->encap_limit; u->flowinfo = p->flowinfo; u->link = p->link; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); } /** * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: unused * @data: parameters passed from userspace * @cmd: command to be performed * * Description: * ip6_tnl_ioctl() is used for managing IPv6 tunnels * from userspace. * * The possible commands are the following: * %SIOCGETTUNNEL: get tunnel parameters for device * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters * %SIOCCHGTUNNEL: change tunnel parameters to those given * %SIOCDELTUNNEL: delete tunnel * * The fallback device "ip6tnl0", created during module * initialization, can be used for creating other tunnel devices. * * Return: * 0 on success, * %-EFAULT if unable to copy data to or from userspace, * %-EPERM if current process hasn't %CAP_NET_ADMIN set * %-EINVAL if passed tunnel parameters are invalid, * %-EEXIST if changing a tunnel's parameters would cause a conflict * %-ENODEV if attempting to change or delete a nonexisting device **/ static int ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && p.proto != 0) break; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (cmd == SIOCCHGTUNNEL) { if (!IS_ERR(t)) { if (t->dev != dev) { err = -EEXIST; break; } } else t = netdev_priv(dev); if (dev == ip6n->fb_tnl_dev) ip6_tnl0_update(t, &p1); else ip6_tnl_update(t, &p1); } if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else { err = PTR_ERR(t); } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } err = 0; unregister_netdevice(dev); break; default: err = -EINVAL; } return err; } /** * ip6_tnl_change_mtu - change mtu manually for tunnel device * @dev: virtual device associated with tunnel * @new_mtu: the new mtu * * Return: * 0 on success, * %-EINVAL if mtu too small **/ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); int t_hlen; t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); int ip6_tnl_get_iflink(const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); return READ_ONCE(t->parms.link); } EXPORT_SYMBOL(ip6_tnl_get_iflink); int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL(ip6_tnl_encap_add_ops); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip6_tnl_encap_del_ops); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip6_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_siocdevprivate = ip6_tnl_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; #define IPXIPX_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) /** * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ static void ip6_tnl_dev_setup(struct net_device *dev) { dev->netdev_ops = &ip6_tnl_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); dev->features |= IPXIPX_FEATURES; dev->hw_features |= IPXIPX_FEATURES; /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } /** * ip6_tnl_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int ret; int t_hlen; t->dev = dev; ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&t->gro_cells, dev); if (ret) goto destroy_dst; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; destroy_dst: dst_cache_destroy(&t->dst_cache); return ret; } /** * ip6_tnl_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device associated with tunnel **/ static int ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int err = ip6_tnl_dev_init_gen(dev); if (err) return err; ip6_tnl_link_config(t); if (t->parms.collect_md) netif_keep_dst(dev); return 0; } /** * ip6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPV6 && proto != IPPROTO_IPIP && proto != 0) return -EINVAL; return 0; } static void ip6_tnl_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); if (data[IFLA_IPTUN_ENCAP_LIMIT]) parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]); if (data[IFLA_IPTUN_FLOWINFO]) parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]); if (data[IFLA_IPTUN_FLAGS]) parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]); if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (data[IFLA_IPTUN_COLLECT_METADATA]) parms->collect_md = true; if (data[IFLA_IPTUN_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ip6_tnl_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip6_tnl_net *ip6n; struct ip6_tnl *nt, *t; struct net *net; int err; net = params->link_net ? : dev_net(dev); ip6n = net_generic(net, ip6_tnl_net_id); nt = netdev_priv(dev); nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &nt->parms); if (nt->parms.collect_md) { if (rtnl_dereference(ip6n->collect_md_tun)) return -EEXIST; } else { t = ip6_tnl_locate(net, &nt->parms, 0); if (!IS_ERR(t)) return -EEXIST; } err = ip6_tnl_create2(dev); if (!err && tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); return err; } static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; if (dev == ip6n->fb_tnl_dev) return -EINVAL; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(t, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &p); if (p.collect_md) return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); ip6_tnl_update(t, &p); return 0; } static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev != ip6n->fb_tnl_dev) unregister_netdevice_queue(dev, head); } static size_t ip6_tnl_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_LIMIT */ nla_total_size(1) + /* IFLA_IPTUN_FLOWINFO */ nla_total_size(4) + /* IFLA_IPTUN_FLAGS */ nla_total_size(4) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (parm->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { .kind = "ip6tnl", .maxtype = IFLA_IPTUN_MAX, .policy = ip6_tnl_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6_tnl_dev_setup, .validate = ip6_tnl_validate, .newlink = ip6_tnl_newlink, .changelink = ip6_tnl_changelink, .dellink = ip6_tnl_dellink, .get_size = ip6_tnl_get_size, .fill_info = ip6_tnl_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, }; static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .handler = mplsip6_rcv, .err_handler = mplsip6_err, .priority = 1, }; static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_net_dereference(net, t->next); } } t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_net_dereference(net, t->next); } } static int __net_init ip6_tnl_init_net(struct net *net) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; if (!net_has_fallback_tunnels(net)) return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ ip6n->fb_tnl_dev->netns_immutable = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; t = netdev_priv(ip6n->fb_tnl_dev); strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: free_netdev(ip6n->fb_tnl_dev); err_alloc_dev: return err; } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, .exit_rtnl = ip6_tnl_exit_rtnl_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; /** * ip6_tunnel_init - register protocol and reserve needed resources * * Return: 0 on success **/ static int __init ip6_tunnel_init(void) { int err; if (!ipv6_mod_enabled()) return -EOPNOTSUPP; err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); if (err < 0) { pr_err("%s: can't register ip4ip6\n", __func__); goto out_ip4ip6; } err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); if (err < 0) { pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } if (ip6_tnl_mpls_supported()) { err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); if (err < 0) { pr_err("%s: can't register mplsip6\n", __func__); goto out_mplsip6; } } err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; return 0; rtnl_link_failed: if (ip6_tnl_mpls_supported()) xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); out_ip4ip6: unregister_pernet_device(&ip6_tnl_net_ops); out_pernet: return err; } /** * ip6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit ip6_tunnel_cleanup(void) { rtnl_link_unregister(&ip6_link_ops); if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) pr_info("%s: can't deregister ip4ip6\n", __func__); if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); if (ip6_tnl_mpls_supported() && xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } module_init(ip6_tunnel_init); module_exit(ip6_tunnel_cleanup);
1063 1066 1045 18 2 1045 1 1 1044 106 105 106 1154 12 1143 1 1144 1125 238 920 1134 1339 1345 5 65 3 65 65 1272 752 918 1328 443 1342 1342 43 404 1345 116 1344 100 1137 1183 118 664 1143 1141 1146 1 191 12 1144 1142 5 5 1142 7 1141 1142 1143 1142 1135 474 706 28 10 10 1135 1136 5 530 1 1128 4 1129 1124 4 13 1 2 10 912 971 725 10 254 265 972 408 576 827 780 780 780 88 823 823 87 559 560 680 582 664 583 194 87 84 87 4 84 84 4 86 87 696 885 758 756 698 911 698 697 84 696 50 3 318 5 38 6 111 111 92 8 8 1 40 1075 656 906 581 10 565 148 145 22 772 774 766 12 570 1081 678 679 678 665 583 498 632 3 168 101 550 579 560 583 102 679 560 981 982 49 949 682 866 662 84 677 498 3 3 3 3 1 2 2 2 135 130 2 3 3 2 5 5 752 8 119 3 861 570 112 672 2 5545 4482 4943 2 107 397 1142 65 735 8 1081 111 397 876 5 123 1122 1105 18 1102 663 854 42 664 837 3 3 1141 1144 13 1141 1145 13 13 726 346 344 345 3 11 95 96 92 421 122 8 1 1 152 150 1 61 126 106 88 17 87 3 84 83 82 34 49 84 7 112 125 10 116 116 7 7 7 110 116 73 1 228 228 225 219 228 227 9 219 13 4 4 4 4 4 86 86 87 232 886 103 10 10 100 2 773 39 879 2 881 58 6 138 6 683 687 4 5 543 143 224 471 544 143 143 1 684 671 14 554 120 2 118 669 712 1 209 888 6 827 74 1 6 908 1 314 124 544 129 17 471 227 146 544 690 689 558 118 674 146 546 472 227 16 557 118 789 10 783 21 118 790 118 790 703 203 3 194 881 118 726 59 51 874 828 58 880 836 58 8 1 1 6 6 2 4 10 2 2 6 59 16 47 977 10 58 68 914 890 136 916 967 967 965 12 965 966 12 12 12 12 12 12 125 17 1 49 89 49 27 27 4 87 11 24 24 1 23 23 23 63 62 63 4 60 60 88 55 63 60 4 63 63 88 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * Some corrections by tytso. */ /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname * lookup logic. */ /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. */ #include <linux/init.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/wordpart.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/audit.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/fcntl.h> #include <linux/device_cgroup.h> #include <linux/fs_struct.h> #include <linux/posix_acl.h> #include <linux/hash.h> #include <linux/bitops.h> #include <linux/init_task.h> #include <linux/uaccess.h> #include "internal.h" #include "mount.h" /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) * were necessary because of omirr. The reason is that omirr needs * to know the _real_ pathname, not the user-supplied one, in case * of symlinks (and also when transname replacements occur). * * The new code replaces the old recursive symlink resolution with * an iterative one (in case of non-nested symlink chains). It does * this with calls to <fs>_follow_link(). * As a side effect, dir_namei(), _namei() and follow_link() are now * replaced with a single function lookup_dentry() that can handle all * the special cases of the former code. * * With the new dcache, the pathname is stored at each inode, at least as * long as the refcount of the inode is positive. As a side effect, the * size of the dcache depends on the inode cache and thus is dynamic. * * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink * resolution to correspond with current state of the code. * * Note that the symlink resolution is not *completely* iterative. * There is still a significant amount of tail- and mid- recursion in * the algorithm. Also, note that <fs>_readlink() is not used in * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink() * may return different results than <fs>_follow_link(). Many virtual * filesystems (including /proc) exhibit this behavior. */ /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL * and the name already exists in form of a symlink, try to create the new * name indicated by the symlink. The old code always complained that the * name already exists, due to not following the symlink even if its target * is nonexistent. The new semantics affects also mknod() and link() when * the name is a symlink pointing to a non-existent name. * * I don't know which semantics is the right one, since I have no access * to standards. But I found by trial that HP-UX 9.0 has the full "new" * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the * "old" one. Personally, I think the new semantics is much more logical. * Note that "ln old new" where "new" is a symlink pointing to a non-existing * file does succeed in both HP-UX and SunOs, but not in Solaris * and in the old Linux semantics. */ /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink * semantics. See the comments in "open_namei" and "do_link" below. * * [10-Sep-98 Alan Modra] Another symlink change. */ /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: * inside the path - always follow. * in the last component in creation/removal/renaming - never follow. * if LOOKUP_FOLLOW passed - follow. * if the pathname has trailing slashes - follow. * otherwise - don't follow. * (applied in that order). * * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT * restored for 2.4. This is the last surviving part of old 4.2BSD bug. * During the 2.4 we need to fix the userland stuff depending on it - * hopefully we will be able to get rid of that wart in 2.5. So far only * XEmacs seems to be relying on it... */ /* * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives * any extra contention... */ /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. * * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) static inline void initname(struct filename *name, const char __user *uptr) { name->uptr = uptr; name->aname = NULL; atomic_set(&name->refcnt, 1); } struct filename * getname_flags(const char __user *filename, int flags) { struct filename *result; char *kname; int len; result = audit_reusename(filename); if (result) return result; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); /* * First, try to embed the struct filename inside the names_cache * allocation */ kname = (char *)result->iname; result->name = kname; len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); /* * Handle both empty path and copy failure in one go. */ if (unlikely(len <= 0)) { if (unlikely(len < 0)) { __putname(result); return ERR_PTR(len); } /* The empty path is special. */ if (!(flags & LOOKUP_EMPTY)) { __putname(result); return ERR_PTR(-ENOENT); } } /* * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a * separate struct filename so we can dedicate the entire * names_cache allocation for the pathname, and re-do the copy from * userland. */ if (unlikely(len == EMBEDDED_NAME_MAX)) { const size_t size = offsetof(struct filename, iname[1]); kname = (char *)result; /* * size is chosen that way we to guarantee that * result->iname[0] is within the same object and that * kname can't be equal to result->iname, no matter what. */ result = kzalloc(size, GFP_KERNEL); if (unlikely(!result)) { __putname(kname); return ERR_PTR(-ENOMEM); } result->name = kname; len = strncpy_from_user(kname, filename, PATH_MAX); if (unlikely(len < 0)) { __putname(kname); kfree(result); return ERR_PTR(len); } /* The empty path is special. */ if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) { __putname(kname); kfree(result); return ERR_PTR(-ENOENT); } if (unlikely(len == PATH_MAX)) { __putname(kname); kfree(result); return ERR_PTR(-ENAMETOOLONG); } } initname(result, filename); audit_getname(result); return result; } struct filename *getname_uflags(const char __user *filename, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; return getname_flags(filename, flags); } struct filename *__getname_maybe_null(const char __user *pathname) { struct filename *name; char c; /* try to save on allocations; loss on um, though */ if (get_user(c, pathname)) return ERR_PTR(-EFAULT); if (!c) return NULL; name = getname_flags(pathname, LOOKUP_EMPTY); if (!IS_ERR(name) && !(name->name[0])) { putname(name); name = NULL; } return name; } struct filename *getname_kernel(const char * filename) { struct filename *result; int len = strlen(filename) + 1; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); if (len <= EMBEDDED_NAME_MAX) { result->name = (char *)result->iname; } else if (len <= PATH_MAX) { const size_t size = offsetof(struct filename, iname[1]); struct filename *tmp; tmp = kmalloc(size, GFP_KERNEL); if (unlikely(!tmp)) { __putname(result); return ERR_PTR(-ENOMEM); } tmp->name = (char *)result; result = tmp; } else { __putname(result); return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, filename, len); initname(result, NULL); audit_getname(result); return result; } EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { int refcnt; if (IS_ERR_OR_NULL(name)) return; refcnt = atomic_read(&name->refcnt); if (refcnt != 1) { if (WARN_ON_ONCE(!refcnt)) return; if (!atomic_dec_and_test(&name->refcnt)) return; } if (name->name != name->iname) { __putname(name->name); kfree(name); } else __putname(name); } EXPORT_SYMBOL(putname); /** * check_acl - perform ACL permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the ACL permission checking. Since this function * retrieve POSIX acls it needs to know whether it is called from a blocking or * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *acl; if (mask & MAY_NOT_BLOCK) { acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(idmap, inode, acl, mask); } acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(idmap, inode, acl, mask); posix_acl_release(acl); return error; } #endif return -EAGAIN; } /* * Very quick optimistic "we know we have no ACL's" check. * * Note that this is purely for ACL_TYPE_ACCESS, and purely * for the "we have cached that there are no ACLs" case. * * If this returns true, we know there are no ACLs. But if * it returns false, we might still not have ACLs (it could * be the is_uncached_acl() case). */ static inline bool no_acl_inode(struct inode *inode) { #ifdef CONFIG_FS_POSIX_ACL return likely(!READ_ONCE(inode->i_acl)); #else return true; #endif } /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the basic UNIX permission checking. Since this * function may retrieve POSIX acls it needs to know whether it is called from a * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; vfsuid_t vfsuid; /* * Common cheap case: everybody has the requested * rights, and there are no ACLs to check. No need * to do any owner/group checks in that case. * * - 'mask&7' is the requested permission bit set * - multiplying by 0111 spreads them out to all of ugo * - '& ~mode' looks for missing inode permission bits * - the '!' is for "no missing permissions" * * After that, we just need to check that there are no * ACL's on the inode - do the 'IS_POSIXACL()' check last * because it will dereference the ->i_sb pointer and we * want to avoid that if at all possible. */ if (!((mask & 7) * 0111 & ~mode)) { if (no_acl_inode(inode)) return 0; if (!IS_POSIXACL(inode)) return 0; } /* Are we the owner? If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? -EACCES : 0; } /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { int error = check_acl(idmap, inode, mask); if (error != -EAGAIN) return error; } /* Only RWX matters for group/other mode bits */ mask &= 7; /* * Are the group permissions different from * the other permissions in the bits we care * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } /* Bits in 'mode' clear that we require? */ return (mask & ~mode) ? -EACCES : 0; } /** * generic_permission - check for access rights on a Posix-like filesystem * @idmap: idmap of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things. * * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; /* * Do the basic permission checks. */ ret = acl_permission_check(idmap, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } /* * Searching includes executable on directories, else just read. */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; /* * Read/write DACs are always overridable. * Executable DACs are overridable when there is * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } EXPORT_SYMBOL(generic_permission); /** * do_inode_permission - UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * We _really_ want to just do "generic_permission()" without * even looking at the inode->i_op values. So we keep a cache * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ static inline int do_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) return inode->i_op->permission(idmap, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } return generic_permission(idmap, inode, mask); } /** * sb_permission - Check superblock-level permissions * @sb: Superblock of inode to check permission on * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { if (unlikely(mask & MAY_WRITE)) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; } return 0; } /** * inode_permission - Check for access rights to a given inode * @idmap: idmap of the mount the inode was found from * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Check for read/write/execute permissions on an inode. We use fs[ug]id for * this, letting us set arbitrary permissions for filesystem access without * changing the "normal" UIDs which are used for other things. * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ int inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int retval; retval = sb_permission(inode->i_sb, inode, mask); if (retval) return retval; if (unlikely(mask & MAY_WRITE)) { /* * Nobody gets write access to an immutable file. */ if (IS_IMMUTABLE(inode)) return -EPERM; /* * Updating mtime will likely cause i_uid and i_gid to be * written back improperly if their true value is unknown * to the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) return -EACCES; } retval = do_inode_permission(idmap, inode, mask); if (retval) return retval; retval = devcgroup_inode_permission(inode, mask); if (retval) return retval; return security_inode_permission(inode, mask); } EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path * @path: path to get the reference to * * Given a path increment the reference count to the dentry and the vfsmount. */ void path_get(const struct path *path) { mntget(path->mnt); dget(path->dentry); } EXPORT_SYMBOL(path_get); /** * path_put - put a reference to a path * @path: path to put the reference to * * Given a path decrement the reference count to the dentry and the vfsmount. */ void path_put(const struct path *path) { dput(path->dentry); mntput(path->mnt); } EXPORT_SYMBOL(path_put); #define EMBEDDED_LEVELS 2 struct nameidata { struct path path; struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags, state; unsigned seq, next_seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; struct saved { struct path link; struct delayed_call done; const char *name; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; const char *pathname; struct nameidata *saved; unsigned root_seq; int dfd; vfsuid_t dir_vfsuid; umode_t dir_mode; } __randomize_layout; #define ND_ROOT_PRESET 1 #define ND_ROOT_GRABBED 2 #define ND_JUMPED 4 static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; p->stack = p->internal; p->depth = 0; p->dfd = dfd; p->name = name; p->pathname = likely(name) ? name->name : ""; p->path.mnt = NULL; p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; current->nameidata = p; } static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name, const struct path *root) { __set_nameidata(p, dfd, name); p->state = 0; if (unlikely(root)) { p->state = ND_ROOT_PRESET; p->root = *root; } } static void restore_nameidata(void) { struct nameidata *now = current->nameidata, *old = now->saved; current->nameidata = old; if (old) old->total_link_count = now->total_link_count; if (now->stack != now->internal) kfree(now->stack); } static bool nd_alloc_stack(struct nameidata *nd) { struct saved *p; p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL); if (unlikely(!p)) return false; memcpy(p, nd->internal, sizeof(nd->internal)); nd->stack = p; return true; } /** * path_connected - Verify that a dentry is below mnt.mnt_root * @mnt: The mountpoint to check. * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. */ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { struct super_block *sb = mnt->mnt_sb; /* Bind mounts can have disconnected paths */ if (mnt->mnt_root == sb->s_root) return true; return is_subdir(dentry, mnt->mnt_root); } static void drop_links(struct nameidata *nd) { int i = nd->depth; while (i--) { struct saved *last = nd->stack + i; do_delayed_call(&last->done); clear_delayed_call(&last->done); } } static void leave_rcu(struct nameidata *nd) { nd->flags &= ~LOOKUP_RCU; nd->seq = nd->next_seq = 0; rcu_read_unlock(); } static void terminate_walk(struct nameidata *nd) { drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); if (nd->state & ND_ROOT_GRABBED) { path_put(&nd->root); nd->state &= ~ND_ROOT_GRABBED; } } else { leave_rcu(nd); } nd->depth = 0; nd->path.mnt = NULL; nd->path.dentry = NULL; } /* path_put is needed afterwards regardless of success or failure */ static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq) { int res = __legitimize_mnt(path->mnt, mseq); if (unlikely(res)) { if (res > 0) path->mnt = NULL; path->dentry = NULL; return false; } if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) { path->dentry = NULL; return false; } return !read_seqcount_retry(&path->dentry->d_seq, seq); } static inline bool legitimize_path(struct nameidata *nd, struct path *path, unsigned seq) { return __legitimize_path(path, seq, nd->m_seq); } static bool legitimize_links(struct nameidata *nd) { int i; if (unlikely(nd->flags & LOOKUP_CACHED)) { drop_links(nd); nd->depth = 0; return false; } for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i; if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { drop_links(nd); nd->depth = i + 1; return false; } } return true; } static bool legitimize_root(struct nameidata *nd) { /* Nothing to do if nd->root is zero or is managed by the VFS user. */ if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq); } /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab * normal reference counts on dentries and vfsmounts to transition to ref-walk * mode. Refcounts are grabbed at the last known good point before rcu-walk * got stuck, so ref-walk may continue from there. If this is not successful * (eg. a seqcount has changed), then failure is returned and it's up to caller * to restart the path walk from the beginning in ref-walk mode. */ /** * try_to_unlazy - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * Returns: true on success, false on failure * * try_to_unlazy attempts to legitimize the current nd->path and nd->root * for ref-walk mode. * Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy() failure and * terminate_walk(). */ static bool try_to_unlazy(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; if (unlikely(!legitimize_root(nd))) goto out; leave_rcu(nd); BUG_ON(nd->inode != parent->d_inode); return true; out1: nd->path.mnt = NULL; nd->path.dentry = NULL; out: leave_rcu(nd); return false; } /** * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into * Returns: true on success, false on failure * * Similar to try_to_unlazy(), but here we have the next dentry already * picked by rcu-walk and want to legitimize that in addition to the current * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). */ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); if (unlikely(res)) { if (res > 0) goto out2; goto out1; } if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; /* * We need to move both the parent and the dentry from the RCU domain * to be properly refcounted. And the sequence number in the dentry * validates *both* dentry counters, since we checked the sequence * number of the parent after we got the child sequence number. So we * know the parent must still be valid if the child sequence number is */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required. */ if (unlikely(!legitimize_root(nd))) goto out_dput; leave_rcu(nd); return true; out2: nd->path.mnt = NULL; out1: nd->path.dentry = NULL; out: leave_rcu(nd); return false; out_dput: leave_rcu(nd); dput(dentry); return false; } static inline int d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) return dentry->d_op->d_revalidate(dir, name, dentry, flags); else return 1; } /** * complete_walk - successful completion of path walk * @nd: pointer nameidata * * If we had been in RCU mode, drop out of it and legitimize nd->path. * Revalidate the final result, unless we'd already done that during * the path walk or the filesystem doesn't ask for it. Return 0 on * success, -error on failure. In case of failure caller does not * need to drop nd->path. */ static int complete_walk(struct nameidata *nd) { struct dentry *dentry = nd->path.dentry; int status; if (nd->flags & LOOKUP_RCU) { /* * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ if (!(nd->state & ND_ROOT_PRESET)) if (!(nd->flags & LOOKUP_IS_SCOPED)) nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) return -ECHILD; } if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't * ever step outside the root during lookup" and should already * be guaranteed by the rest of namei, we want to avoid a namei * BUG resulting in userspace being given a path that was not * scoped within the root at some point during the lookup. * * So, do a final sanity-check to make sure that in the * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED) * we won't silently return an fd completely outside of the * requested root to userspace. * * Userspace could move the path outside the root after this * check, but as discussed elsewhere this is not a concern (the * resolved file was inside the root at some point). */ if (!path_is_under(&nd->path, &nd->root)) return -EXDEV; } if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) return 0; status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); if (status > 0) return 0; if (!status) status = -ESTALE; return status; } static int set_root(struct nameidata *nd) { struct fs_struct *fs = current->fs; /* * Jumping to the real root in a scoped-lookup is a BUG in namei, but we * still have to ensure it doesn't happen because it will cause a breakout * from the dirfd. */ if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED)) return -ENOTRECOVERABLE; if (nd->flags & LOOKUP_RCU) { unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); nd->state |= ND_ROOT_GRABBED; } return 0; } static int nd_jump_root(struct nameidata *nd) { if (unlikely(nd->flags & LOOKUP_BENEATH)) return -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { /* Absolute path arguments to path_init() are allowed. */ if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt) return -EXDEV; } if (!nd->root.mnt) { int error = set_root(nd); if (error) return error; } if (nd->flags & LOOKUP_RCU) { struct dentry *d; nd->path = nd->root; d = nd->path.dentry; nd->inode = d->d_inode; nd->seq = nd->root_seq; if (read_seqcount_retry(&d->d_seq, nd->seq)) return -ECHILD; } else { path_put(&nd->path); nd->path = nd->root; path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } nd->state |= ND_JUMPED; return 0; } /* * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ int nd_jump_link(const struct path *path) { int error = -ELOOP; struct nameidata *nd = current->nameidata; if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS)) goto err; error = -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { if (nd->path.mnt != path->mnt) goto err; } /* Not currently safe for scoped-lookups. */ if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) goto err; path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->state |= ND_JUMPED; return 0; err: path_put(path); return error; } static inline void put_link(struct nameidata *nd) { struct saved *last = nd->stack + --nd->depth; do_delayed_call(&last->done); if (!(nd->flags & LOOKUP_RCU)) path_put(&last->link); } static int sysctl_protected_symlinks __read_mostly; static int sysctl_protected_hardlinks __read_mostly; static int sysctl_protected_fifos __read_mostly; static int sysctl_protected_regular __read_mostly; #ifdef CONFIG_SYSCTL static const struct ctl_table namei_sysctls[] = { { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, }; static int __init init_fs_namei_sysctls(void) { register_sysctl_init("fs", namei_sysctls); return 0; } fs_initcall(init_fs_namei_sysctls); #endif /* CONFIG_SYSCTL */ /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is * in a sticky world-writable directory. This is to protect privileged * processes from failing races against path names that may change out * from under them by way of other users creating malicious symlinks. * It will permit symlinks to be followed only when outside a sticky * world-writable directory, or when the uid of the symlink and follower * match, or when the directory owner matches the symlink's owner. * * Returns 0 if following the symlink is allowed, -ve on error. */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { struct mnt_idmap *idmap; vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; idmap = mnt_idmap(nd->path.mnt); vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. */ if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; if (nd->flags & LOOKUP_RCU) return -ECHILD; audit_inode(nd->name, nd->stack[0].link.dentry, 0); audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link"); return -EACCES; } /** * safe_hardlink_source - Check for safe hardlink conditions * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: * - inode is not a regular file * - inode is setuid * - inode is setgid and group-exec * - access failure for read and write * * Otherwise returns true. */ static bool safe_hardlink_source(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; /* Special files should not get pinned to the filesystem. */ if (!S_ISREG(mode)) return false; /* Setuid files should not get pinned to the filesystem. */ if (mode & S_ISUID) return false; /* Executable setgid files should not get pinned to the filesystem. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) return false; return true; } /** * may_linkat - Check permissions for creating a hardlink * @idmap: idmap of the mount the inode was found from * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error. */ int may_linkat(struct mnt_idmap *idmap, const struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) return 0; /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ if (safe_hardlink_source(idmap, inode) || inode_owner_or_capable(idmap, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); return -EPERM; } /** * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. * @idmap: idmap of the mount the inode was found from * @nd: nameidata pathwalk data * @inode: the inode of the file to open * * Block an O_CREAT open of a FIFO (or a regular file) when: * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled * - the file already exists * - we are in a sticky directory * - we don't own the file * - the owner of the directory doesn't own the file * - the directory is world writable * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 * the directory doesn't have to be world writable: being group writable will * be enough. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if the open is allowed, -ve on error. */ static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid; if (likely(!(dir_mode & S_ISVTX))) return 0; if (S_ISREG(inode->i_mode) && !sysctl_protected_regular) return 0; if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) return 0; i_vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq(i_vfsuid, dir_vfsuid)) return 0; if (vfsuid_eq_kuid(i_vfsuid, current_fsuid())) return 0; if (likely(dir_mode & 0002)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create"); return -EACCES; } if (dir_mode & 0020) { if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create_fifo"); return -EACCES; } if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create_regular"); return -EACCES; } } return 0; } /* * follow_up - Find the mountpoint of path's vfsmount * * Given a path, find the mountpoint of its source file system. * Replace @path with the path of the mountpoint in the parent mount. * Up is towards /. * * Return 1 if we went up a level and 0 if we were already at the * root. */ int follow_up(struct path *path) { struct mount *mnt = real_mount(path->mnt); struct mount *parent; struct dentry *mountpoint; read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); path->mnt = &parent->mnt; return 1; } EXPORT_SYMBOL(follow_up); static bool choose_mountpoint_rcu(struct mount *m, const struct path *root, struct path *path, unsigned *seqp) { while (mnt_has_parent(m)) { struct dentry *mountpoint = m->mnt_mountpoint; m = m->mnt_parent; if (unlikely(root->dentry == mountpoint && root->mnt == &m->mnt)) break; if (mountpoint != m->mnt.mnt_root) { path->mnt = &m->mnt; path->dentry = mountpoint; *seqp = read_seqcount_begin(&mountpoint->d_seq); return true; } } return false; } static bool choose_mountpoint(struct mount *m, const struct path *root, struct path *path) { bool found; rcu_read_lock(); while (1) { unsigned seq, mseq = read_seqbegin(&mount_lock); found = choose_mountpoint_rcu(m, root, path, &seq); if (unlikely(!found)) { if (!read_seqretry(&mount_lock, mseq)) break; } else { if (likely(__legitimize_path(path, seq, mseq))) break; rcu_read_unlock(); path_put(path); rcu_read_lock(); } } rcu_read_unlock(); return found; } /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ static int follow_automount(struct path *path, int *count, unsigned lookup_flags) { struct dentry *dentry = path->dentry; /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to * the name. * * We do, however, want to mount if someone wants to open or * create a file of any type under the mountpoint, wants to * traverse through the mountpoint or wants to open the * mounted directory. Also, autofs may mark negative dentries * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. */ if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && dentry->d_inode) return -EISDIR; if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; return finish_automount(dentry->d_op->d_automount(path), path); } /* * mount traversal - out-of-line part. One note on ->d_flags accesses - * dentries are pinned but not locked here, so negative dentry can go * positive right under us. Use of smp_load_acquire() provides a barrier * sufficient for ->d_inode and ->d_flags consistency. */ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, int *count, unsigned lookup_flags) { struct vfsmount *mnt = path->mnt; bool need_mntput = false; int ret = 0; while (flags & DCACHE_MANAGED_DENTRY) { /* Allow the filesystem to manage the transit without i_mutex * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (flags & DCACHE_MOUNTED) { // something's mounted on it.. struct vfsmount *mounted = lookup_mnt(path); if (mounted) { // ... in our namespace dput(path->dentry); if (need_mntput) mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; continue; } } if (!(flags & DCACHE_NEED_AUTOMOUNT)) break; // uncovered automount point ret = follow_automount(path, count, lookup_flags); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (ret == -EISDIR) ret = 0; // possible if you race with several mount --move if (need_mntput && path->mnt == mnt) mntput(path->mnt); if (!ret && unlikely(d_flags_negative(flags))) ret = -ENOENT; *jumped = need_mntput; return ret; } static inline int traverse_mounts(struct path *path, bool *jumped, int *count, unsigned lookup_flags) { unsigned flags = smp_load_acquire(&path->dentry->d_flags); /* fastpath */ if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { *jumped = false; if (unlikely(d_flags_negative(flags))) return -ENOENT; return 0; } return __traverse_mounts(path, flags, jumped, count, lookup_flags); } int follow_down_one(struct path *path) { struct vfsmount *mounted; mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); return 1; } return 0; } EXPORT_SYMBOL(follow_down_one); /* * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. */ int follow_down(struct path *path, unsigned int flags) { struct vfsmount *mnt = path->mnt; bool jumped; int ret = traverse_mounts(path, &jumped, NULL, flags); if (path->mnt != mnt) mntput(mnt); return ret; } EXPORT_SYMBOL(follow_down); /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; if (likely(!(flags & DCACHE_MANAGED_DENTRY))) return true; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return false; for (;;) { /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) { int res = dentry->d_op->d_manage(path, true); if (res) return res == -EISDIR; flags = dentry->d_flags; } if (flags & DCACHE_MOUNTED) { struct mount *mounted = __lookup_mnt(path->mnt, dentry); if (mounted) { path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; nd->next_seq = read_seqcount_begin(&dentry->d_seq); flags = dentry->d_flags; // makes sure that non-RCU pathwalk could reach // this state. if (read_seqretry(&mount_lock, nd->m_seq)) return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq)) return false; } return !(flags & DCACHE_NEED_AUTOMOUNT); } } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, struct path *path) { bool jumped; int ret; path->mnt = nd->path.mnt; path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; nd->next_seq = seq; if (!try_to_unlazy_next(nd, dentry)) return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { if (unlikely(nd->flags & LOOKUP_NO_XDEV)) ret = -EXDEV; else nd->state |= ND_JUMPED; } if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); } return ret; } /* * This looks up the name in dcache and possibly revalidates the found dentry. * NULL is returned if the dentry does not exist in the cache. */ static struct dentry *lookup_dcache(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry = d_lookup(dir, name); if (dentry) { int error = d_revalidate(dir->d_inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) d_invalidate(dentry); dput(dentry); return ERR_PTR(error); } } return dentry; } static struct dentry *lookup_one_qstr_excl_raw(const struct qstr *name, struct dentry *base, unsigned int flags) { struct dentry *dentry; struct dentry *old; struct inode *dir; dentry = lookup_dcache(name, base, flags); if (dentry) return dentry; /* Don't create child dentry for a dead directory. */ dir = base->d_inode; if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); dentry = d_alloc(base, name); if (unlikely(!dentry)) return ERR_PTR(-ENOMEM); old = dir->i_op->lookup(dir, dentry, flags); if (unlikely(old)) { dput(dentry); dentry = old; } return dentry; } /* * Parent directory has inode locked exclusive. This is one * and only case when ->lookup() gets called on non in-lookup * dentries - as the matter of fact, this only gets called * when directory is guaranteed to have no in-lookup children * at all. * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. */ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags) { struct dentry *dentry; dentry = lookup_one_qstr_excl_raw(name, base, flags); if (IS_ERR(dentry)) return dentry; if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) { dput(dentry); return ERR_PTR(-ENOENT); } if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) { dput(dentry); return ERR_PTR(-EEXIST); } return dentry; } EXPORT_SYMBOL(lookup_one_qstr_excl); /** * lookup_fast - do fast lockless (but racy) lookup of a dentry * @nd: current nameidata * * Do a fast, but racy lookup in the dcache for the given dentry, and * revalidate it. Returns a valid dentry pointer or NULL if one wasn't * found. On error, an ERR_PTR will be returned. * * If this function returns a valid dentry and the walk is no longer * lazy, the dentry will carry a reference that must later be put. If * RCU mode is still in force, then this is not the case and the dentry * must be legitimized before use. If this returns NULL, then the walk * will no longer be in RCU mode. */ static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, the caller is * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); if (unlikely(!dentry)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); return NULL; } /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. */ if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); if (likely(status > 0)) return dentry; if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return NULL; status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) d_invalidate(dentry); dput(dentry); return ERR_PTR(status); } return dentry; } /* Fast lookup failed, do it the slow way */ static struct dentry *__lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry, *old; struct inode *inode = dir->d_inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) return ERR_PTR(-ENOENT); again: dentry = d_alloc_parallel(dir, name, &wq); if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { int error = d_revalidate(inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) { d_invalidate(dentry); dput(dentry); goto again; } dput(dentry); dentry = ERR_PTR(error); } } else { old = inode->i_op->lookup(inode, dentry, flags); d_lookup_done(dentry); if (unlikely(old)) { dput(dentry); dentry = old; } } return dentry; } static struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct inode *inode = dir->d_inode; struct dentry *res; inode_lock_shared(inode); res = __lookup_slow(name, dir, flags); inode_unlock_shared(inode); return res; } static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *restrict nd) { int err, mask; mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; err = inode_permission(idmap, nd->inode, mask | MAY_EXEC); if (likely(!err)) return 0; // If we failed, and we weren't in LOOKUP_RCU, it's final if (!(nd->flags & LOOKUP_RCU)) return err; // Drop out of RCU mode to make sure it wasn't transient if (!try_to_unlazy(nd)) return -ECHILD; // redo it all non-lazy if (err != -ECHILD) // hard error return err; return inode_permission(idmap, nd->inode, MAY_EXEC); } static int reserve_stack(struct nameidata *nd, struct path *link) { if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; if (likely(nd->depth != EMBEDDED_LEVELS)) return 0; if (likely(nd->stack != nd->internal)) return 0; if (likely(nd_alloc_stack(nd))) return 0; if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. And we can't skip // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, nd->next_seq); if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; if (nd_alloc_stack(nd)) return 0; } return -ENOMEM; } enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, int flags) { struct saved *last; const char *res; int error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); return ERR_PTR(error); } last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); last->seq = nd->next_seq; if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); if (unlikely(error)) return ERR_PTR(error); } if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); } else if (atime_needs_update(&last->link, inode)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); touch_atime(&last->link); } error = security_inode_follow_link(link->dentry, inode, nd->flags & LOOKUP_RCU); if (unlikely(error)) return ERR_PTR(error); res = READ_ONCE(inode->i_link); if (!res) { const char * (*get)(struct dentry *, struct inode *, struct delayed_call *); get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd)) res = get(link->dentry, inode, &last->done); } else { res = get(link->dentry, inode, &last->done); } if (!res) goto all_done; if (IS_ERR(res)) return res; } if (*res == '/') { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); while (unlikely(*++res == '/')) ; } if (*res) return res; all_done: // pure jump put_link(nd); return NULL; } /* * Do we need to follow links? We _really_ want to be able * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. * * NOTE: dentry must be what nd->next_seq had been sampled from. */ static const char *step_into(struct nameidata *nd, int flags, struct dentry *dentry) { struct path path; struct inode *inode; int err = handle_mounts(nd, dentry, &path); if (err < 0) return ERR_PTR(err); inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ if (nd->flags & LOOKUP_RCU) { if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); if (unlikely(!inode)) return ERR_PTR(-ENOENT); } else { dput(nd->path.dentry); if (nd->path.mnt != path.mnt) mntput(nd->path.mnt); } nd->path = path; nd->inode = inode; nd->seq = nd->next_seq; return NULL; } if (nd->flags & LOOKUP_RCU) { /* make sure that d_is_symlink above matches inode */ if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); } else { if (path.mnt == nd->path.mnt) mntget(path.mnt); } return pick_link(nd, &path, inode, flags); } static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; unsigned seq; if (!choose_mountpoint_rcu(real_mount(nd->path.mnt), &nd->root, &path, &seq)) goto in_root; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-ECHILD); nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; // makes sure that non-RCU pathwalk could reach this state if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ } old = nd->path.dentry; parent = old->d_parent; nd->next_seq = read_seqcount_begin(&parent->d_seq); // makes sure that non-RCU pathwalk could reach this state if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); return parent; in_root: if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); nd->next_seq = nd->seq; return nd->path.dentry; } static struct dentry *follow_dotdot(struct nameidata *nd) { struct dentry *parent; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; if (!choose_mountpoint(real_mount(nd->path.mnt), &nd->root, &path)) goto in_root; path_put(&nd->path); nd->path = path; nd->inode = path.dentry->d_inode; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-EXDEV); } /* rare case of legitimate dget_parent()... */ parent = dget_parent(nd->path.dentry); if (unlikely(!path_connected(nd->path.mnt, parent))) { dput(parent); return ERR_PTR(-ENOENT); } return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); return dget(nd->path.dentry); } static const char *handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { const char *error = NULL; struct dentry *parent; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); if (error) return error; } if (nd->flags & LOOKUP_RCU) parent = follow_dotdot_rcu(nd); else parent = follow_dotdot(nd); if (IS_ERR(parent)) return ERR_CAST(parent); error = step_into(nd, WALK_NOFOLLOW, parent); if (unlikely(error)) return error; if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * If there was a racing rename or mount along our * path, then we can't be sure that ".." hasn't jumped * above nd->root (and so userspace should retry or use * some fallback). */ smp_rmb(); if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)) return ERR_PTR(-EAGAIN); if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)) return ERR_PTR(-EAGAIN); } } return NULL; } static const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (unlikely(!dentry)) { dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); if (IS_ERR(dentry)) return ERR_CAST(dentry); } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return step_into(nd, flags, dentry); } /* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * * - Architectures with fast unaligned word accesses. We could * do a "get_unaligned()" if this helps and is sufficiently * fast. * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. * * - Furthermore, we need an efficient 64-bit compile for the * 64-bit case in order to generate the "number of bytes in * the final mask". Again, that could be replaced with a * efficient population count instruction or similar. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> #ifdef HASH_MIX /* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */ #elif defined(CONFIG_64BIT) /* * Register pressure in the mixing function is an issue, particularly * on 32-bit x86, but almost any function requires one state value and * one temporary. Instead, use a function designed for two state values * and no temporaries. * * This function cannot create a collision in only two iterations, so * we have two iterations to achieve avalanche. In those two iterations, * we have six layers of mixing, which is enough to spread one bit's * influence out to 2^6 = 64 state bits. * * Rotate constants are scored by considering either 64 one-bit input * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the * probability of that delta causing a change to each of the 128 output * bits, using a sample of random initial states. * * The Shannon entropy of the computed probabilities is then summed * to produce a score. Ideally, any input change has a 50% chance of * toggling any given output bit. * * Mixing scores (in bits) for (12,45): * Input delta: 1-bit 2-bit * 1 round: 713.3 42542.6 * 2 rounds: 2753.7 140389.8 * 3 rounds: 5954.1 233458.2 * 4 rounds: 7862.6 256672.2 * Perfect: 8192 258048 * (64*128) (64*63/2 * 128) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol64(x,12),\ x += y, y = rol64(y,45),\ y *= 9 ) /* * Fold two longs into one 32-bit hash value. This must be fast, but * latency isn't quite as critical, as there is a fair bit of additional * work done before the hash value is used. */ static inline unsigned int fold_hash(unsigned long x, unsigned long y) { y ^= x * GOLDEN_RATIO_64; y *= GOLDEN_RATIO_64; return y >> 32; } #else /* 32-bit case */ /* * Mixing scores (in bits) for (7,20): * Input delta: 1-bit 2-bit * 1 round: 330.3 9201.6 * 2 rounds: 1246.4 25475.4 * 3 rounds: 1907.1 31295.1 * 4 rounds: 2042.3 31718.6 * Perfect: 2048 31744 * (32*64) (32*31/2 * 64) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol32(x, 7),\ x += y, y = rol32(y,20),\ y *= 9 ) static inline unsigned int fold_hash(unsigned long x, unsigned long y) { /* Use arch-optimized multiply if one exists */ return __hash_32(y ^ __hash_32(x)); } #endif /* * Return the hash of a string of known length. This is carfully * designed to match hash_name(), which is the more critical function. * In particular, we must end by hashing a final word containing 0..7 * payload bytes, to match the way that hash_name() iterates until it * finds the delimiter after the name. */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long a, x = 0, y = (unsigned long)salt; for (;;) { if (!len) goto done; a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; HASH_MIX(x, y, a); name += sizeof(unsigned long); len -= sizeof(unsigned long); } x ^= a & bytemask_from_count(len); done: return fold_hash(x, y); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long a = 0, x = 0, y = (unsigned long)salt; unsigned long adata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; len = 0; goto inside; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); inside: a = load_unaligned_zeropad(name+len); } while (!has_zero(a, &adata, &constants)); adata = prep_zero_mask(a, adata, &constants); mask = create_zero_mask(adata); x ^= a & zero_bytemask(mask); return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } EXPORT_SYMBOL(hashlen_string); /* * Calculate the length and hash of the path component, and * return the length as the result. */ static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { unsigned long a, b, x, y = (unsigned long)nd->path.dentry; unsigned long adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; /* * The first iteration is special, because it can result in * '.' and '..' and has no mixing other than the final fold. */ a = load_unaligned_zeropad(name); b = a ^ REPEAT_BYTE('/'); if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) { adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); a &= zero_bytemask(mask); *lastword = a; len = find_zero(mask); nd->last.hash = fold_hash(a, y); nd->last.len = len; return name + len; } len = 0; x = 0; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); a &= zero_bytemask(mask); x ^= a; len += find_zero(mask); *lastword = 0; // Multi-word components cannot be DOT or DOTDOT nd->last.hash = fold_hash(x, y); nd->last.len = len; return name + len; } /* * Note that the 'last' word is always zero-masked, but * was loaded as a possibly big-endian word. */ #ifdef __BIG_ENDIAN #define LAST_WORD_IS_DOT (0x2eul << (BITS_PER_LONG-8)) #define LAST_WORD_IS_DOTDOT (0x2e2eul << (BITS_PER_LONG-16)) #endif #else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ /* Return the hash of a string of known length */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long hash = init_name_hash(salt); while (len--) hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; while (c) { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } return hashlen_create(end_name_hash(hash), len); } EXPORT_SYMBOL(hashlen_string); /* * We know there's a real path component here of at least * one character. */ static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { unsigned long hash = init_name_hash(nd->path.dentry); unsigned long len = 0, c, last = 0; c = (unsigned char)*name; do { last = (last << 8) + c; len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); // This is reliable for DOT or DOTDOT, since the component // cannot contain NUL characters - top bits being zero means // we cannot have had any other pathnames. *lastword = last; nd->last.hash = end_name_hash(hash); nd->last.len = len; return name + len; } #endif #ifndef LAST_WORD_IS_DOT #define LAST_WORD_IS_DOT 0x2e #define LAST_WORD_IS_DOTDOT 0x2e2e #endif /* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. * * Returns 0 and nd will have valid dentry and mnt on success. * Returns error and drops reference to input namei data on failure. */ static int link_path_walk(const char *name, struct nameidata *nd) { int depth = 0; // depth <= nd->depth int err; nd->last_type = LAST_ROOT; nd->flags |= LOOKUP_PARENT; if (IS_ERR(name)) return PTR_ERR(name); while (*name=='/') name++; if (!*name) { nd->dir_mode = 0; // short-circuit the 'hardening' idiocy return 0; } /* At this point we know we have a real path component. */ for(;;) { struct mnt_idmap *idmap; const char *link; unsigned long lastword; idmap = mnt_idmap(nd->path.mnt); err = may_lookup(idmap, nd); if (err) return err; nd->last.name = name; name = hash_name(nd, name, &lastword); switch(lastword) { case LAST_WORD_IS_DOTDOT: nd->last_type = LAST_DOTDOT; nd->state |= ND_JUMPED; break; case LAST_WORD_IS_DOT: nd->last_type = LAST_DOT; break; default: nd->last_type = LAST_NORM; nd->state &= ~ND_JUMPED; struct dentry *parent = nd->path.dentry; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { err = parent->d_op->d_hash(parent, &nd->last); if (err < 0) return err; } } if (!*name) goto OK; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { name++; } while (unlikely(*name == '/')); if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ if (!depth) { nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; } /* last component of nested symlink */ name = nd->stack[--depth].name; link = walk_component(nd, 0); } else { /* not the last component */ link = walk_component(nd, WALK_MORE); } if (unlikely(link)) { if (IS_ERR(link)) return PTR_ERR(link); /* a symlink to follow */ nd->stack[depth++].name = name; name = link; continue; } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return -ECHILD; } return -ENOTDIR; } } } /* must be paired with terminate_walk() */ static const char *path_init(struct nameidata *nd, unsigned flags) { int error; const char *s = nd->pathname; /* LOOKUP_CACHED requires RCU, ask caller to retry */ if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) return ERR_PTR(-EAGAIN); if (!*s) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); else nd->seq = nd->next_seq = 0; nd->flags = flags; nd->state |= ND_JUMPED; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); if (nd->state & ND_ROOT_PRESET) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) return ERR_PTR(-ENOTDIR); nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); nd->root_seq = nd->seq; } else { path_get(&nd->path); } return s; } nd->root.mnt = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); return s; } /* Relative pathname -- get the starting-point it is relative to. */ if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->path = fs->pwd; nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; } } else { /* Caller must check execute permissions on the starting path component */ CLASS(fd_raw, f)(nd->dfd); struct dentry *dentry; if (fd_empty(f)) return ERR_PTR(-EBADF); if (flags & LOOKUP_LINKAT_EMPTY) { if (fd_file(f)->f_cred != current_cred() && !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) return ERR_PTR(-ENOENT); } dentry = fd_file(f)->f_path.dentry; if (*s && unlikely(!d_can_lookup(dentry))) return ERR_PTR(-ENOTDIR); nd->path = fd_file(f)->f_path; if (flags & LOOKUP_RCU) { nd->inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } } /* For scoped-lookups we need to set the root to the dirfd as well. */ if (flags & LOOKUP_IS_SCOPED) { nd->root = nd->path; if (flags & LOOKUP_RCU) { nd->root_seq = nd->seq; } else { path_get(&nd->root); nd->state |= ND_ROOT_GRABBED; } } return s; } static inline const char *lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; return walk_component(nd, WALK_TRAILING); } static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); nd->next_seq = nd->seq; return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); int err; if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) { err = handle_lookup_down(nd); if (unlikely(err < 0)) s = ERR_PTR(err); } while (!(err = link_path_walk(s, nd)) && (s = lookup_last(nd)) != NULL) ; if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd); nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... } if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; if (!err) { *path = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); if (unlikely(retval == -ESTALE)) retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) audit_inode(name, path->dentry, flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); restore_nameidata(); return retval; } /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { const char *s = path_init(nd, flags); int err = link_path_walk(s, nd); if (!err) err = complete_walk(nd); if (!err) { *parent = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } /* Note: this does not consume "name" */ static int __filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) retval = path_parentat(&nd, flags, parent); if (unlikely(retval == -ESTALE)) retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); if (likely(!retval)) { *last = nd.last; *type = nd.last_type; audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); } restore_nameidata(); return retval; } static int filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type) { return __filename_parentat(dfd, name, flags, parent, last, type, NULL); } /* does lookup, returns the object with parent locked */ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) { struct path parent_path __free(path_put) = {}; struct dentry *d; struct qstr last; int type, error; error = filename_parentat(dfd, name, 0, &parent_path, &last, &type); if (error) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); if (IS_ERR(d)) { inode_unlock(parent_path.dentry->d_inode); return d; } path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; } struct dentry *kern_path_locked_negative(const char *name, struct path *path) { struct path parent_path __free(path_put) = {}; struct filename *filename __free(putname) = getname_kernel(name); struct dentry *d; struct qstr last; int type, error; error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type); if (error) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl_raw(&last, parent_path.dentry, 0); if (IS_ERR(d)) { inode_unlock(parent_path.dentry->d_inode); return d; } path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; } struct dentry *kern_path_locked(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path); putname(filename); return res; } struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path) { struct filename *filename = getname(name); struct dentry *res = __kern_path_locked(dfd, filename, path); putname(filename); return res; } EXPORT_SYMBOL(user_path_locked_at); int kern_path(const char *name, unsigned int flags, struct path *path) { struct filename *filename = getname_kernel(name); int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL); putname(filename); return ret; } EXPORT_SYMBOL(kern_path); /** * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair * @filename: filename structure * @flags: lookup flags * @parent: pointer to struct path to fill * @last: last component * @type: type of the last component * @root: pointer to struct path of the base directory */ int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { return __filename_parentat(AT_FDCWD, filename, flags, parent, last, type, root); } EXPORT_SYMBOL(vfs_path_parent_lookup); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair * @dentry: pointer to dentry of the base directory * @mnt: pointer to vfs mount of the base directory * @name: pointer to file name * @flags: lookup flags * @path: pointer to struct path to fill */ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct path *path) { struct filename *filename; struct path root = {.mnt = mnt, .dentry = dentry}; int ret; filename = getname_kernel(name); /* the first argument of filename_lookup() is ignored with root */ ret = filename_lookup(AT_FDCWD, filename, flags, path, &root); putname(filename); return ret; } EXPORT_SYMBOL(vfs_path_lookup); static int lookup_one_common(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len, struct qstr *this) { this->name = name; this->len = len; this->hash = full_name_hash(base, name, len); if (!len) return -EACCES; if (is_dot_dotdot(name, len)) return -EACCES; while (len--) { unsigned int c = *(const unsigned char *)name++; if (c == '/' || c == '\0') return -EACCES; } /* * See if the low-level filesystem might want * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { int err = base->d_op->d_hash(base, this); if (err < 0) return err; } return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** * try_lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Look up a dentry by name in the dcache, returning NULL if it does not * currently exist. The function does not try to create a dentry. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * No locks need be held - only a counted reference to @base is needed. * */ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; int err; err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); return lookup_dcache(&this, base, 0); } EXPORT_SYMBOL(try_lookup_one_len); /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { struct dentry *dentry; struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one_len); /** * lookup_one - filesystem helper to lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *dentry; struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one); /** * lookup_one_unlocked - filesystem helper to lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct qstr this; int err; struct dentry *ret; err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); ret = lookup_dcache(&this, base, 0); if (!ret) ret = lookup_slow(&this, base, 0); return ret; } EXPORT_SYMBOL(lookup_one_unlocked); /** * lookup_one_positive_unlocked - filesystem helper to lookup single * pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns * known positive or ERR_PTR(). This is what most of the users want. * * Note that pinned negative with unlocked parent _can_ become positive at any * time, so callers of lookup_one_unlocked() need to be very careful; pinned * positives have >d_inode stable, so this one avoids such problems. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The helper should be called without i_mutex held. */ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *ret = lookup_one_unlocked(idmap, name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); ret = ERR_PTR(-ENOENT); } return ret; } EXPORT_SYMBOL(lookup_one_positive_unlocked); /** * lookup_one_len_unlocked - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ struct dentry *lookup_one_len_unlocked(const char *name, struct dentry *base, int len) { return lookup_one_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_one_len_unlocked); /* * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT) * on negatives. Returns known positive or ERR_PTR(); that's what * most of the users want. Note that pinned negative with unlocked parent * _can_ become positive at any time, so callers of lookup_one_len_unlocked() * need to be very careful; pinned positives have ->d_inode stable, so * this one avoids such problems. */ struct dentry *lookup_positive_unlocked(const char *name, struct dentry *base, int len) { return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_positive_unlocked); #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { /* Find something mounted on "pts" in the same directory as * the input path. */ struct dentry *parent = dget_parent(path->dentry); struct dentry *child; struct qstr this = QSTR_INIT("pts", 3); if (unlikely(!path_connected(path->mnt, parent))) { dput(parent); return -ENOENT; } dput(path->dentry); path->dentry = parent; child = d_hash_and_lookup(parent, &this); if (IS_ERR_OR_NULL(child)) return -ENOENT; path->dentry = child; dput(parent); follow_down(path, 0); return 0; } #endif int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { struct filename *filename = getname_flags(name, flags); int ret = filename_lookup(dfd, filename, flags, path, NULL); putname(filename); return ret; } EXPORT_SYMBOL(user_path_at); int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid)) return 0; if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid)) return 0; return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. * 1. We can't do it if dir is read-only (done in permission()) * 2. We should have write and exec permissions on dir * 3. We can't remove anything from append-only dir * 4. We can't do anything with immutable dir (done in permission()) * 5. If the sticky bit on dir is set we should either * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do antyhing with * links pointing to it. * 7. If the victim has an unknown uid or gid we can't change the inode. * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. * 10. We can't remove a root or mountpoint. * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ static int may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); int error; if (d_is_negative(victim)) return -ENOENT; BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; if (victim->d_flags & DCACHE_NFSFS_RENAMED) return -EBUSY; return 0; } /* Check whether we can create an object with dentry child in directory * dir. * 1. We can't do it if child already exists (open has special treatment for * this case, but since we are inlined it's OK) * 2. We can't do it if dir is read-only (done in permission()) * 3. We can't do it if the fs can't represent the fsuid or fsgid. * 4. We should have write and exec permissions on dir * 5. We can't do it if dir is immutable (done in permission()) */ static inline int may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } // p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) { struct dentry *p = p1, *q = p2, *r; while ((r = p->d_parent) != p2 && r != p) p = r; if (r == p2) { // p is a child of p2 and an ancestor of p1 or p1 itself inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2); return p; } // p is the root of connected component that contains p1 // p2 does not occur on the path from p to p1 while ((r = q->d_parent) != p1 && r != p && r != q) q = r; if (r == p1) { // q is a child of p1 and an ancestor of p2 or p2 itself inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return q; } else if (likely(r == p)) { // both p2 and p1 are descendents of p inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } else { // no common ancestor at the time we'd been called mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); return ERR_PTR(-EXDEV); } } /* * p1 and p2 should be directories on the same fs. */ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) { if (p1 == p2) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } mutex_lock(&p1->d_sb->s_vfs_rename_mutex); return lock_two_directories(p1, p2); } EXPORT_SYMBOL(lock_rename); /* * c1 and p2 should be on the same fs. */ struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) { if (READ_ONCE(c1->d_parent) == p2) { /* * hopefully won't need to touch ->s_vfs_rename_mutex at all. */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); /* * now that p2 is locked, nobody can move in or out of it, * so the test below is safe. */ if (likely(c1->d_parent == p2)) return NULL; /* * c1 got moved out of p2 while we'd been taking locks; * unlock and fall back to slow case. */ inode_unlock(p2->d_inode); } mutex_lock(&c1->d_sb->s_vfs_rename_mutex); /* * nobody can move out of any directories on this fs. */ if (likely(c1->d_parent != p2)) return lock_two_directories(c1->d_parent, p2); /* * c1 got moved into p2 while we were taking locks; * we need p2 locked and ->s_vfs_rename_mutex unlocked, * for consistency with lock_rename(). */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); return NULL; } EXPORT_SYMBOL(lock_rename_child); void unlock_rename(struct dentry *p1, struct dentry *p2) { inode_unlock(p1->d_inode); if (p1 != p2) { inode_unlock(p2->d_inode); mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); } } EXPORT_SYMBOL(unlock_rename); /** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode * @mode: mode of the new inode * @mask_perms: allowed permission by the vfs * @type: type of file to be created * * This helper consolidates and enforces vfs restrictions on the @mode of a new * object to be created. * * Umask stripping depends on whether the filesystem supports POSIX ACLs (see * the kernel documentation for mode_strip_umask()). Moving umask stripping * after setgid stripping allows the same ordering for both non-POSIX ACL and * POSIX ACL supporting filesystems. * * Note that it's currently valid for @type to be 0 if a directory is created. * Filesystems raise that flag individually and we need to check whether each * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a * non-zero type. * * Returns: mode to be passed to the filesystem */ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode, umode_t mask_perms, umode_t type) { mode = mode_strip_sgid(idmap, dir, mode); mode = mode_strip_umask(dir, mode); /* * Apply the vfs mandated allowed permission mask and set the type of * file to be created before we call into the filesystem. */ mode &= (mask_perms & ~S_IFMT); mode |= (type & S_IFMT); return mode; } /** * vfs_create - create new file * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child file * @mode: mode of the child file * @want_excl: whether the file must not yet exist * * Create a new file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) { int error; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_create); int vfs_mkobj(struct dentry *dentry, umode_t mode, int (*f)(struct dentry *, umode_t, void *), void *arg) { struct inode *dir = dentry->d_parent->d_inode; int error = may_create(&nop_mnt_idmap, dir, dentry); if (error) return error; mode &= S_IALLUGO; mode |= S_IFREG; error = security_inode_create(dir, dentry, mode); if (error) return error; error = f(dentry, mode, arg); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkobj); bool may_open_dev(const struct path *path) { return !(path->mnt->mnt_flags & MNT_NODEV) && !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } static int may_open(struct mnt_idmap *idmap, const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; if (!inode) return -ENOENT; switch (inode->i_mode & S_IFMT) { case S_IFLNK: return -ELOOP; case S_IFDIR: if (acc_mode & MAY_WRITE) return -EISDIR; if (acc_mode & MAY_EXEC) return -EACCES; break; case S_IFBLK: case S_IFCHR: if (!may_open_dev(path)) return -EACCES; fallthrough; case S_IFIFO: case S_IFSOCK: if (acc_mode & MAY_EXEC) return -EACCES; flag &= ~O_TRUNC; break; case S_IFREG: if ((acc_mode & MAY_EXEC) && path_noexec(path)) return -EACCES; break; default: VFS_BUG_ON_INODE(1, inode); } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); if (error) return error; /* * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) return -EPERM; if (flag & O_TRUNC) return -EPERM; } /* O_NOATIME can only be set by the owner or superuser */ if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) return -EPERM; return 0; } static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) return error; error = security_file_truncate(filp); if (!error) { error = do_truncate(idmap, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } put_write_access(inode); return error; } static inline int open_to_namei_flags(int flag) { if ((flag & O_ACCMODE) == 3) flag--; return flag; } static int may_o_create(struct mnt_idmap *idmap, const struct path *dir, struct dentry *dentry, umode_t mode) { int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) return -EOVERFLOW; error = inode_permission(idmap, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; return security_inode_create(dir->dentry->d_inode, dentry, mode); } /* * Attempt to atomically look up, create and open a file from a negative * dentry. * * Returns 0 if successful. The file will have been created and attached to * @file by the filesystem calling finish_open(). * * If the file was looked up only or didn't need creating, FMODE_OPENED won't * be set. The caller will need to perform the open themselves. @path will * have been updated to point to the new dentry. This may be negative. * * Returns an error code otherwise. */ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, struct file *file, int open_flag, umode_t mode) { struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = nd->path.dentry->d_inode; int error; if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; file->f_path.dentry = DENTRY_NOT_SET; file->f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); if (!error) { if (file->f_mode & FMODE_OPENED) { if (unlikely(dentry != file->f_path.dentry)) { dput(dentry); dentry = dget(file->f_path.dentry); } } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; } else { if (file->f_path.dentry) { dput(dentry); dentry = file->f_path.dentry; } if (unlikely(d_is_negative(dentry))) error = -ENOENT; } } if (error) { dput(dentry); dentry = ERR_PTR(error); } return dentry; } /* * Look up and maybe create and open the last component. * * Must be called with parent locked (exclusive in O_CREAT case). * * Returns 0 on success, that is, if * the file was successfully atomically created (if necessary) and opened, or * the file was not completely opened at this time, though lookups and * creations were performed. * These case are distinguished by presence of FMODE_OPENED on file->f_mode. * In the latter case dentry returned in @path might be negative if O_CREAT * hadn't been specified. * * An error code is returned on failure. */ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, bool got_write) { struct mnt_idmap *idmap; struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) return ERR_PTR(-ENOENT); file->f_mode &= ~FMODE_CREATED; dentry = d_lookup(dir, &nd->last); for (;;) { if (!dentry) { dentry = d_alloc_parallel(dir, &nd->last, &wq); if (IS_ERR(dentry)) return dentry; } if (d_in_lookup(dentry)) break; error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags); if (likely(error > 0)) break; if (error) goto out_dput; d_invalidate(dentry); dput(dentry); dentry = NULL; } if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ return dentry; } if (open_flag & O_CREAT) audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* * Checking write permission is tricky, bacuse we don't know if we are * going to actually need it: O_CREAT opens should work as long as the * file exists. But checking existence breaks atomicity. The trick is * to check access and if not granted clear O_CREAT from the flags. * * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); if (likely(got_write)) create_error = may_o_create(idmap, &nd->path, dentry, mode); else create_error = -EROFS; } if (create_error) open_flag &= ~O_CREAT; if (dir_inode->i_op->atomic_open) { dentry = atomic_open(nd, dentry, file, open_flag, mode); if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) dentry = ERR_PTR(create_error); return dentry; } if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags); d_lookup_done(dentry); if (unlikely(res)) { if (IS_ERR(res)) { error = PTR_ERR(res); goto out_dput; } dput(dentry); dentry = res; } } /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { error = -EACCES; goto out_dput; } error = dir_inode->i_op->create(idmap, dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; } if (unlikely(create_error) && !dentry->d_inode) { error = create_error; goto out_dput; } return dentry; out_dput: dput(dentry); return ERR_PTR(error); } static inline bool trailing_slashes(struct nameidata *nd) { return (bool)nd->last.name[nd->last.len]; } static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) { struct dentry *dentry; if (open_flag & O_CREAT) { if (trailing_slashes(nd)) return ERR_PTR(-EISDIR); /* Don't bother on an O_EXCL create */ if (open_flag & O_EXCL) return NULL; } if (trailing_slashes(nd)) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; dentry = lookup_fast(nd); if (IS_ERR_OR_NULL(dentry)) return dentry; if (open_flag & O_CREAT) { /* Discard negative dentries. Need inode_lock to do the create */ if (!dentry->d_inode) { if (!(nd->flags & LOOKUP_RCU)) dput(dentry); dentry = NULL; } } return dentry; } static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; struct dentry *dentry; const char *res; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { if (nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } /* We _can_ be in RCU mode here */ dentry = lookup_fast_for_open(nd, open_flag); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (likely(dentry)) goto finish_lookup; if (!(open_flag & O_CREAT)) { if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) return ERR_PTR(-ECHILD); } else { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { got_write = !mnt_want_write(nd->path.mnt); /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be * dropping this one anyway. */ } if (open_flag & O_CREAT) inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); dentry = lookup_open(nd, file, op, got_write); if (!IS_ERR(dentry)) { if (file->f_mode & FMODE_CREATED) fsnotify_create(dir->d_inode, dentry); if (file->f_mode & FMODE_OPENED) fsnotify_open(file); } if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); if (got_write) mnt_drop_write(nd->path.mnt); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry); nd->path.dentry = dentry; return NULL; } finish_lookup: if (nd->depth) put_link(nd); res = step_into(nd, WALK_TRAILING, dentry); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; } /* * Handle the last step of open() */ static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct mnt_idmap *idmap; int open_flag = op->open_flag; bool do_truncate; int acc_mode; int error; if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) { error = complete_walk(nd); if (error) return error; } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; } if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) return -ENOTDIR; do_truncate = false; acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; acc_mode = 0; } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt); if (error) return error; do_truncate = true; } error = may_open(idmap, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) error = security_file_post_open(file, op->acc_mode); if (!error && do_truncate) error = handle_truncate(idmap, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } if (do_truncate) mnt_drop_write(nd->path.mnt); return error; } /** * vfs_tmpfile - create tmpfile * @idmap: idmap of the mount the inode was found from * @parentpath: pointer to the path of the base directory * @file: file descriptor of the new tmpfile * @mode: mode of the new tmpfile * * Create a temporary file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode) { struct dentry *child; struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; int open_flag = file->f_flags; /* we want directory to be writable */ error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (!dir->i_op->tmpfile) return -EOPNOTSUPP; child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) return -ENOMEM; file->f_path.mnt = parentpath->mnt; file->f_path.dentry = child; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); if (file->f_mode & FMODE_OPENED) fsnotify_open(file); if (error) return error; /* Don't check for other permissions, the inode was just created */ error = may_open(idmap, &file->f_path, 0, file->f_flags); if (error) return error; inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } security_inode_post_create_tmpfile(idmap, inode); return 0; } /** * kernel_tmpfile_open - open a tmpfile for kernel internal use * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile * @open_flag: flags * @cred: credentials for open * * Create and open a temporary file. The file is not accounted in nr_files, * hence this is only for kernel internal use, and must not be installed into * file tables or such. */ struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred) { struct file *file; int error; file = alloc_empty_file_noaccount(open_flag, cred); if (IS_ERR(file)) return file; error = vfs_tmpfile(idmap, parentpath, file, mode); if (error) { fput(file); file = ERR_PTR(error); } return file; } EXPORT_SYMBOL(kernel_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); if (error) goto out2; audit_inode(nd->name, file->f_path.dentry, 0); out2: mnt_drop_write(path.mnt); out: path_put(&path); return error; } static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) { struct path path; int error = path_lookupat(nd, flags, &path); if (!error) { audit_inode(nd->name, path.dentry, 0); error = vfs_open(&path, file); path_put(&path); } return error; } static struct file *path_openat(struct nameidata *nd, const struct open_flags *op, unsigned flags) { struct file *file; int error; file = alloc_empty_file(op->open_flag, current_cred()); if (IS_ERR(file)) return file; if (unlikely(file->f_flags & __O_TMPFILE)) { error = do_tmpfile(nd, flags, op, file); } else if (unlikely(file->f_flags & O_PATH)) { error = do_o_path(nd, flags, file); } else { const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) && (s = open_last_lookups(nd, file, op)) != NULL) ; if (!error) error = do_open(nd, file, op); terminate_walk(nd); } if (likely(!error)) { if (likely(file->f_mode & FMODE_OPENED)) return file; WARN_ON(1); error = -EINVAL; } fput_close(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; else error = -ESTALE; } return ERR_PTR(error); } struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op) { struct nameidata nd; int flags = op->lookup_flags; struct file *filp; set_nameidata(&nd, dfd, pathname, NULL); filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) filp = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); return filp; } struct file *do_file_open_root(const struct path *root, const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; struct filename *filename; int flags = op->lookup_flags; if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); filename = getname_kernel(name); if (IS_ERR(filename)) return ERR_CAST(filename); set_nameidata(&nd, -1, filename, root); file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) file = path_openat(&nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) file = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); putname(filename); return file; } static struct dentry *filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; bool want_dir = lookup_flags & LOOKUP_DIRECTORY; unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; int err2; int error; error = filename_parentat(dfd, name, reval_flag, path, &last, &type); if (error) return ERR_PTR(error); /* * Yucky last component or no last component at all? * (foo/., foo/.., /////) */ if (unlikely(type != LAST_NORM)) goto out; /* don't fail immediately if it's r/o, at least try to report other errors */ err2 = mnt_want_write(path->mnt); /* * Do the final lookup. Suppress 'create' if there is a trailing * '/', and a directory wasn't requested. */ if (last.name[last.len] && !want_dir) create_flags &= ~LOOKUP_CREATE; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; if (unlikely(err2)) { error = err2; goto fail; } return dentry; fail: dput(dentry); dentry = ERR_PTR(error); unlock: inode_unlock(path->dentry->d_inode); if (!err2) mnt_drop_write(path->mnt); out: path_put(path); return dentry; } struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); putname(filename); return res; } EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); } EXPORT_SYMBOL(done_path_create); inline struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); putname(filename); return res; } EXPORT_SYMBOL(user_path_create); /** * vfs_mknod - create device node or file * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child device node * @mode: mode of the child device node * @dev: device number of device to create * * Create a device node or file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(idmap, dir, dentry); if (error) return error; if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) return -EPERM; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = devcgroup_inode_mknod(mode, dev); if (error) return error; error = security_inode_mknod(dir, dentry, mode, dev); if (error) return error; error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: case 0: /* zero mode translates to S_IFREG */ return 0; case S_IFDIR: return -EPERM; default: return -EINVAL; } } static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { struct mnt_idmap *idmap; struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = 0; error = may_mknod(mode); if (error) goto out1; retry: dentry = filename_create(dfd, name, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out1; error = security_path_mknod(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode), dev); if (error) goto out2; idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(idmap, path.dentry->d_inode, dentry, mode, true); if (!error) security_path_post_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, 0); break; } out2: done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out1: putname(name); return error; } SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, dev) { return do_mknodat(dfd, getname(filename), mode, dev); } SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { return do_mknodat(AT_FDCWD, getname(filename), mode, dev); } /** * vfs_mkdir - create directory returning correct dentry if possible * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child directory * @mode: mode of the child directory * * Create a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * In the event that the filesystem does not use the *@dentry but leaves it * negative or unhashes it and possibly splices a different one returning it, * the original dentry is dput() and the alternate is returned. * * In case of an error the dentry is dput() and an ERR_PTR() is returned. */ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; unsigned max_links = dir->i_sb->s_max_links; struct dentry *de; error = may_create(idmap, dir, dentry); if (error) goto err; error = -EPERM; if (!dir->i_op->mkdir) goto err; mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) goto err; error = -EMLINK; if (max_links && dir->i_nlink >= max_links) goto err; de = dir->i_op->mkdir(idmap, dir, dentry, mode); error = PTR_ERR(de); if (IS_ERR(de)) goto err; if (de) { dput(dentry); dentry = de; } fsnotify_mkdir(dir, dentry); return dentry; err: dput(dentry); return ERR_PTR(error); } EXPORT_SYMBOL(vfs_mkdir); int do_mkdirat(int dfd, struct filename *name, umode_t mode) { struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = LOOKUP_DIRECTORY; retry: dentry = filename_create(dfd, name, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putname; error = security_path_mkdir(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); if (IS_ERR(dentry)) error = PTR_ERR(dentry); } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out_putname: putname(name); return error; } SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { return do_mkdirat(dfd, getname(pathname), mode); } SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { return do_mkdirat(AT_FDCWD, getname(pathname), mode); } /** * vfs_rmdir - remove directory * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child directory * * Remove a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { int error = may_delete(idmap, dir, dentry, 1); if (error) return error; if (!dir->i_op->rmdir) return -EPERM; dget(dentry); inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry) || (dentry->d_inode->i_flags & S_KERNEL_FILE)) goto out; error = security_inode_rmdir(dir, dentry); if (error) goto out; error = dir->i_op->rmdir(dir, dentry); if (error) goto out; shrink_dcache_parent(dentry); dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); out: inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete_notify(dir, dentry); return error; } EXPORT_SYMBOL(vfs_rmdir); int do_rmdir(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) goto exit1; switch (type) { case LAST_DOTDOT: error = -ENOTEMPTY; goto exit2; case LAST_DOT: error = -EINVAL; goto exit2; case LAST_ROOT: error = -EBUSY; goto exit2; } error = mnt_want_write(path.mnt); if (error) goto exit2; inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; error = security_path_rmdir(&path, dentry); if (error) goto exit4; error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); exit4: dput(dentry); exit3: inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } exit1: putname(name); return error; } SYSCALL_DEFINE1(rmdir, const char __user *, pathname) { return do_rmdir(AT_FDCWD, getname(pathname)); } /** * vfs_unlink - unlink a filesystem object * @idmap: idmap of the mount the inode was found from * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. * * The caller must hold dir->i_mutex. * * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and * return a reference to the inode in delegated_inode. The caller * should then break the delegation on that inode and retry. Because * breaking a delegation may take a long time, the caller should drop * dir->i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; int error = may_delete(idmap, dir, dentry, 0); if (error) return error; if (!dir->i_op->unlink) return -EPERM; inode_lock(target); if (IS_SWAPFILE(target)) error = -EPERM; else if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); if (!error) { error = try_break_deleg(target, delegated_inode); if (error) goto out; error = dir->i_op->unlink(dir, dentry); if (!error) { dont_mount(dentry); detach_mounts(dentry); } } } out: inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) { fsnotify_unlink(dir, dentry); } else if (!error) { fsnotify_link_count(target); d_delete_notify(dir, dentry); } return error; } EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its * directory's i_mutex. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ int do_unlinkat(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; struct inode *inode = NULL; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) goto exit1; error = -EISDIR; if (type != LAST_NORM) goto exit2; error = mnt_want_write(path.mnt); if (error) goto exit2; retry_deleg: inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ if (last.name[last.len]) goto slashes; inode = dentry->d_inode; ihold(inode); error = security_path_unlink(&path, dentry); if (error) goto exit3; error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, &delegated_inode); exit3: dput(dentry); } inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path.mnt); exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; inode = NULL; goto retry; } exit1: putname(name); return error; slashes: if (d_is_dir(dentry)) error = -EISDIR; else error = -ENOTDIR; goto exit3; } SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) { if ((flag & ~AT_REMOVEDIR) != 0) return -EINVAL; if (flag & AT_REMOVEDIR) return do_rmdir(dfd, getname(pathname)); return do_unlinkat(dfd, getname(pathname)); } SYSCALL_DEFINE1(unlink, const char __user *, pathname) { return do_unlinkat(AT_FDCWD, getname(pathname)); } /** * vfs_symlink - create symlink * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child symlink file * @oldname: name of the file to link to * * Create a symlink. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname) { int error; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->symlink) return -EPERM; error = security_inode_symlink(dir, dentry, oldname); if (error) return error; error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_symlink); int do_symlinkat(struct filename *from, int newdfd, struct filename *to) { int error; struct dentry *dentry; struct path path; unsigned int lookup_flags = 0; if (IS_ERR(from)) { error = PTR_ERR(from); goto out_putnames; } retry: dentry = filename_create(newdfd, to, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putnames; error = security_path_symlink(&path, dentry, from->name); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out_putnames: putname(to); putname(from); return error; } SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_symlinkat(getname(oldname), newdfd, getname(newname)); } SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) { return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname)); } /** * vfs_link - create a new link * @old_dentry: object to be linked * @idmap: idmap of the mount * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break * * The caller must hold dir->i_mutex * * If vfs_link discovers a delegation on the to-be-linked file in need * of breaking, it will return -EWOULDBLOCK and return a reference to the * inode in delegated_inode. The caller should then break the delegation * and retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; int error; if (!inode) return -ENOENT; error = may_create(idmap, dir, new_dentry); if (error) return error; if (dir->i_sb != inode->i_sb) return -EXDEV; /* * A link to an append-only or immutable file cannot be created. */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to * be writen back improperly if their true value is unknown to * the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) return -EPERM; error = security_inode_link(old_dentry, dir, new_dentry); if (error) return error; inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; else { error = try_break_deleg(inode, delegated_inode); if (!error) error = dir->i_op->link(old_dentry, dir, new_dentry); } if (!error && (inode->i_state & I_LINKABLE)) { spin_lock(&inode->i_lock); inode->i_state &= ~I_LINKABLE; spin_unlock(&inode->i_lock); } inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; } EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid * security-related surprises by not following symlinks on the * newname. --KAB * * We don't follow them on the oldname either to be compatible * with linux 2.0, and to avoid hard-linking to directories * and other special files. --ADM */ int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags) { struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; struct inode *delegated_inode = NULL; int how = 0; int error; if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) { error = -EINVAL; goto out_putnames; } /* * To use null names we require CAP_DAC_READ_SEARCH or * that the open-time creds of the dfd matches current. * This ensures that not everyone will be able to create * a hardlink using the passed file descriptor. */ if (flags & AT_EMPTY_PATH) how |= LOOKUP_LINKAT_EMPTY; if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; retry: error = filename_lookup(olddfd, old, how, &old_path, NULL); if (error) goto out_putnames; new_dentry = filename_create(newdfd, new, &new_path, (how & LOOKUP_REVAL)); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out_putpath; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; idmap = mnt_idmap(new_path.mnt); error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { path_put(&old_path); goto retry; } } if (retry_estale(error, how)) { path_put(&old_path); how |= LOOKUP_REVAL; goto retry; } out_putpath: path_put(&old_path); out_putnames: putname(old); putname(new); return error; } SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, int, flags) { return do_linkat(olddfd, getname_uflags(oldname, flags), newdfd, getname(newname), flags); } SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) { return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } /** * vfs_rename - rename a filesystem object * @rd: pointer to &struct renamedata info * * The caller must hold multiple mutexes--see lock_rename()). * * If vfs_rename discovers a delegation in need of breaking at either * the source or destination, it will return -EWOULDBLOCK and return a * reference to the inode in delegated_inode. The caller should then * break the delegation and retry. Because breaking a delegation may * take a long time, the caller should drop all locks before doing * so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4BSD screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. * c) we may have to lock up to _four_ objects - parents and victim (if it exists), * and source (if it's a non-directory or a subdirectory that moves to * different parent). * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_mutex _and_ that parent of the object we * move will be locked. Thus we can rank directories by the tree * (ancestors first) and rank all non-directories after them. * That works since everybody except rename does "lock parent, lookup, * lock child" and rename is under ->s_vfs_rename_mutex. * HOWEVER, it relies on the assumption that any object with ->lookup() * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. * d) conversion from fhandle to dentry may come in the wrong moment - when * we are removing the target. Solution: we will have to grab ->i_mutex * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ int vfs_rename(struct renamedata *rd) { int error; struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir; struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; struct inode **delegated_inode = rd->delegated_inode; unsigned int flags = rd->flags; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; struct name_snapshot old_name; bool lock_old_subdir, lock_new_subdir; if (source == target) return 0; error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, is_dir); else error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) return error; if (!old_dir->i_op->rename) return -EPERM; /* * If we are going to change the parent - check write permissions, * we'll need to flip '..'. */ if (new_dir != old_dir) { if (is_dir) { error = inode_permission(rd->old_mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { error = inode_permission(rd->new_mnt_idmap, target, MAY_WRITE); if (error) return error; } } error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (error) return error; take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); /* * Lock children. * The source subdirectory needs to be locked on cross-directory * rename or cross-directory exchange since its parent changes. * The target subdirectory needs to be locked on cross-directory * exchange due to parent change and on any rename due to becoming * a victim. * Non-directories need locking in all cases (for NFS reasons); * they get locked after any subdirectories (in inode address order). * * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE. * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex. */ lock_old_subdir = new_dir != old_dir; lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE); if (is_dir) { if (lock_old_subdir) inode_lock_nested(source, I_MUTEX_CHILD); if (target && (!new_is_dir || lock_new_subdir)) inode_lock(target); } else if (new_is_dir) { if (lock_new_subdir) inode_lock_nested(target, I_MUTEX_CHILD); inode_lock(source); } else { lock_two_nondirectories(source, target); } error = -EPERM; if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) goto out; error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; if (max_links && new_dir != old_dir) { error = -EMLINK; if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) goto out; if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && old_dir->i_nlink >= max_links) goto out; } if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) goto out; } if (target && !new_is_dir) { error = try_break_deleg(target, delegated_inode); if (error) goto out; } error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; if (!(flags & RENAME_EXCHANGE) && target) { if (is_dir) { shrink_dcache_parent(new_dentry); target->i_flags |= S_DEAD; } dont_mount(new_dentry); detach_mounts(new_dentry); } if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { if (!(flags & RENAME_EXCHANGE)) d_move(old_dentry, new_dentry); else d_exchange(old_dentry, new_dentry); } out: if (!is_dir || lock_old_subdir) inode_unlock(source); if (target && (!new_is_dir || lock_new_subdir)) inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, &old_name.name, is_dir, !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); if (flags & RENAME_EXCHANGE) { fsnotify_move(new_dir, old_dir, &old_dentry->d_name, new_is_dir, NULL, new_dentry); } } release_dentry_name_snapshot(&old_name); return error; } EXPORT_SYMBOL(vfs_rename); int do_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) { struct renamedata rd; struct dentry *old_dentry, *new_dentry; struct dentry *trap; struct path old_path, new_path; struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; bool should_retry = false; int error = -EINVAL; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) goto put_names; if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && (flags & RENAME_EXCHANGE)) goto put_names; if (flags & RENAME_EXCHANGE) target_flags = 0; if (flags & RENAME_NOREPLACE) target_flags |= LOOKUP_EXCL; retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, &old_last, &old_type); if (error) goto put_names; error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, &new_type); if (error) goto exit1; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto exit2; error = -EBUSY; if (old_type != LAST_NORM) goto exit2; if (flags & RENAME_NOREPLACE) error = -EEXIST; if (new_type != LAST_NORM) goto exit2; error = mnt_want_write(old_path.mnt); if (error) goto exit2; retry_deleg: trap = lock_rename(new_path.dentry, old_path.dentry); if (IS_ERR(trap)) { error = PTR_ERR(trap); goto exit_lock_rename; } old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry, lookup_flags); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; if (flags & RENAME_EXCHANGE) { if (!d_is_dir(new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) goto exit5; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ if (!d_is_dir(old_dentry)) { error = -ENOTDIR; if (old_last.name[old_last.len]) goto exit5; if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) goto exit5; } /* source should not be ancestor of target */ error = -EINVAL; if (old_dentry == trap) goto exit5; /* target should not be an ancestor of source */ if (!(flags & RENAME_EXCHANGE)) error = -ENOTEMPTY; if (new_dentry == trap) goto exit5; error = security_path_rename(&old_path, old_dentry, &new_path, new_dentry, flags); if (error) goto exit5; rd.old_dir = old_path.dentry->d_inode; rd.old_dentry = old_dentry; rd.old_mnt_idmap = mnt_idmap(old_path.mnt); rd.new_dir = new_path.dentry->d_inode; rd.new_dentry = new_dentry; rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_path.dentry, old_path.dentry); exit_lock_rename: if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(old_path.mnt); exit2: if (retry_estale(error, lookup_flags)) should_retry = true; path_put(&new_path); exit1: path_put(&old_path); if (should_retry) { should_retry = false; lookup_flags |= LOOKUP_REVAL; goto retry; } put_names: putname(from); putname(to); return error; } SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) { return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), flags); } SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 0); } SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen) { int copylen; copylen = linklen; if (unlikely(copylen > (unsigned) buflen)) copylen = buflen; if (copy_to_user(buffer, link, copylen)) copylen = -EFAULT; return copylen; } /** * vfs_readlink - copy symlink body into userspace buffer * @dentry: dentry on which to get symbolic link * @buffer: user memory pointer * @buflen: size of buffer * * Does not touch atime. That's up to the caller if necessary * * Does not call security hook. */ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = d_inode(dentry); DEFINE_DELAYED_CALL(done); const char *link; int res; if (inode->i_opflags & IOP_CACHED_LINK) return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen); if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); if (!d_is_symlink(dentry)) return -EINVAL; spin_lock(&inode->i_lock); inode->i_opflags |= IOP_DEFAULT_READLINK; spin_unlock(&inode->i_lock); } link = READ_ONCE(inode->i_link); if (!link) { link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } EXPORT_SYMBOL(vfs_readlink); /** * vfs_get_link - get symlink body * @dentry: dentry on which to get symbolic link * @done: caller needs to free returned data with this * * Calls security hook and i_op->get_link() on the supplied inode. * * It does not touch atime. That's up to the caller if necessary. * * Does not work on "special" symlinks like /proc/$$/fd/N */ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) { const char *res = ERR_PTR(-EINVAL); struct inode *inode = d_inode(dentry); if (d_is_symlink(dentry)) { res = ERR_PTR(security_inode_readlink(dentry)); if (!res) res = inode->i_op->get_link(dentry, inode, done); } return res; } EXPORT_SYMBOL(vfs_get_link); /* get the link contents into pagecache */ static char *__page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { struct page *page; struct address_space *mapping = inode->i_mapping; if (!dentry) { page = find_get_page(mapping, 0); if (!page) return ERR_PTR(-ECHILD); if (!PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } } else { page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) return (char*)page; } set_delayed_call(callback, page_put_link, page); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); return page_address(page); } const char *page_get_link_raw(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { return __page_get_link(dentry, inode, callback); } EXPORT_SYMBOL_GPL(page_get_link_raw); const char *page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { char *kaddr = __page_get_link(dentry, inode, callback); if (!IS_ERR(kaddr)) nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } EXPORT_SYMBOL(page_get_link); void page_put_link(void *arg) { put_page(arg); } EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { const char *link; int res; DEFINE_DELAYED_CALL(done); link = page_get_link(dentry, d_inode(dentry), &done); res = PTR_ERR(link); if (!IS_ERR(link)) res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } EXPORT_SYMBOL(page_readlink); int page_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); struct folio *folio; void *fsdata = NULL; int err; unsigned int flags; retry: if (nofs) flags = memalloc_nofs_save(); err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata); if (nofs) memalloc_nofs_restore(flags); if (err) goto fail; memcpy(folio_address(folio), symname, len - 1); err = aops->write_end(NULL, mapping, 0, len - 1, len - 1, folio, fsdata); if (err < 0) goto fail; if (err < len-1) goto retry; mark_inode_dirty(inode); return 0; fail: return err; } EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .get_link = page_get_link, }; EXPORT_SYMBOL(page_symlink_inode_operations);
565 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HEX_H #define _LINUX_HEX_H #include <linux/types.h> extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] #define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4] static inline char *hex_byte_pack(char *buf, u8 byte) { *buf++ = hex_asc_hi(byte); *buf++ = hex_asc_lo(byte); return buf; } extern const char hex_asc_upper[]; #define hex_asc_upper_lo(x) hex_asc_upper[((x) & 0x0f)] #define hex_asc_upper_hi(x) hex_asc_upper[((x) & 0xf0) >> 4] static inline char *hex_byte_pack_upper(char *buf, u8 byte) { *buf++ = hex_asc_upper_hi(byte); *buf++ = hex_asc_upper_lo(byte); return buf; } extern int hex_to_bin(unsigned char ch); extern int __must_check hex2bin(u8 *dst, const char *src, size_t count); extern char *bin2hex(char *dst, const void *src, size_t count); bool mac_pton(const char *s, u8 *mac); #endif
17 19 3 16 30 30 30 28 1 3 1 1 1 1 23 21 22 2 17 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/udp.h> #include <linux/icmpv6.h> #include <linux/types.h> #include <linux/kernel.h> #include <net/fou.h> #include <net/ip.h> #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> #include <net/protocol.h> #include <net/udp.h> #include <net/udp_tunnel.h> #if IS_ENABLED(CONFIG_IPV6_FOU_TUNNEL) static void fou6_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, struct flowi6 *fl6, u8 *protocol, __be16 sport) { struct udphdr *uh; skb_push(skb, sizeof(struct udphdr)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->dest = e->dport; uh->source = sport; uh->len = htons(skb->len); udp6_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM6), skb, &fl6->saddr, &fl6->daddr, skb->len); *protocol = IPPROTO_UDP; } static int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi6 *fl6) { __be16 sport; int err; int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; err = __fou_build_header(skb, e, protocol, &sport, type); if (err) return err; fou6_build_udp(skb, e, fl6, protocol, sport); return 0; } static int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi6 *fl6) { __be16 sport; int err; int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; err = __gue_build_header(skb, e, protocol, &sport, type); if (err) return err; fou6_build_udp(skb, e, fl6, protocol, sport); return 0; } static int gue6_err_proto_handler(int proto, struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct inet6_protocol *ipprot; ipprot = rcu_dereference(inet6_protos[proto]); if (ipprot && ipprot->err_handler) { if (!ipprot->err_handler(skb, opt, type, code, offset, info)) return 0; } return -ENOENT; } static int gue6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { int transport_offset = skb_transport_offset(skb); struct guehdr *guehdr; size_t len, optlen; int ret; len = sizeof(struct udphdr) + sizeof(struct guehdr); if (!pskb_may_pull(skb, transport_offset + len)) return -EINVAL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; switch (guehdr->version) { case 0: /* Full GUE header present */ break; case 1: { /* Direct encasulation of IPv4 or IPv6 */ skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr)); switch (((struct iphdr *)guehdr)->version) { case 4: ret = gue6_err_proto_handler(IPPROTO_IPIP, skb, opt, type, code, offset, info); goto out; case 6: ret = gue6_err_proto_handler(IPPROTO_IPV6, skb, opt, type, code, offset, info); goto out; default: ret = -EOPNOTSUPP; goto out; } } default: /* Undefined version */ return -EOPNOTSUPP; } if (guehdr->control) return -ENOENT; optlen = guehdr->hlen << 2; if (!pskb_may_pull(skb, transport_offset + len + optlen)) return -EINVAL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; if (validate_gue_flags(guehdr, optlen)) return -EINVAL; /* Handling exceptions for direct UDP encapsulation in GUE would lead to * recursion. Besides, this kind of encapsulation can't even be * configured currently. Discard this. */ if (guehdr->proto_ctype == IPPROTO_UDP || guehdr->proto_ctype == IPPROTO_UDPLITE) return -EOPNOTSUPP; skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr)); ret = gue6_err_proto_handler(guehdr->proto_ctype, skb, opt, type, code, offset, info); out: skb_set_transport_header(skb, transport_offset); return ret; } static const struct ip6_tnl_encap_ops fou_ip6tun_ops = { .encap_hlen = fou_encap_hlen, .build_header = fou6_build_header, .err_handler = gue6_err, }; static const struct ip6_tnl_encap_ops gue_ip6tun_ops = { .encap_hlen = gue_encap_hlen, .build_header = gue6_build_header, .err_handler = gue6_err, }; static int ip6_tnl_encap_add_fou_ops(void) { int ret; ret = ip6_tnl_encap_add_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU); if (ret < 0) { pr_err("can't add fou6 ops\n"); return ret; } ret = ip6_tnl_encap_add_ops(&gue_ip6tun_ops, TUNNEL_ENCAP_GUE); if (ret < 0) { pr_err("can't add gue6 ops\n"); ip6_tnl_encap_del_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU); return ret; } return 0; } static void ip6_tnl_encap_del_fou_ops(void) { ip6_tnl_encap_del_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU); ip6_tnl_encap_del_ops(&gue_ip6tun_ops, TUNNEL_ENCAP_GUE); } #else static int ip6_tnl_encap_add_fou_ops(void) { return 0; } static void ip6_tnl_encap_del_fou_ops(void) { } #endif static int __init fou6_init(void) { int ret; ret = ip6_tnl_encap_add_fou_ops(); return ret; } static void __exit fou6_fini(void) { ip6_tnl_encap_del_fou_ops(); } module_init(fou6_init); module_exit(fou6_fini); MODULE_AUTHOR("Tom Herbert <therbert@google.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Foo over UDP (IPv6)");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef __DSA_TAG_H #define __DSA_TAG_H #include <linux/if_vlan.h> #include <linux/list.h> #include <linux/types.h> #include <net/dsa.h> #include "port.h" #include "user.h" struct dsa_tag_driver { const struct dsa_device_ops *ops; struct list_head list; struct module *owner; }; extern struct packet_type dsa_pack_type; const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol); const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name); void dsa_tag_driver_put(const struct dsa_device_ops *ops); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) { return ops->needed_headroom + ops->needed_tailroom; } static inline struct net_device *dsa_conduit_find_user(struct net_device *dev, int device, int port) { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dp->ds->index == device && dp->index == port && dp->type == DSA_PORT_TYPE_USER) return dp->user; return NULL; } /** * dsa_software_untag_vlan_aware_bridge: Software untagging for VLAN-aware bridge * @skb: Pointer to received socket buffer (packet) * @br: Pointer to bridge upper interface of ingress port * @vid: Parsed VID from packet * * The bridge can process tagged packets. Software like STP/PTP may not. The * bridge can also process untagged packets, to the same effect as if they were * tagged with the PVID of the ingress port. So packets tagged with the PVID of * the bridge port must be software-untagged, to support both use cases. */ static inline void dsa_software_untag_vlan_aware_bridge(struct sk_buff *skb, struct net_device *br, u16 vid) { u16 pvid, proto; int err; err = br_vlan_get_proto(br, &proto); if (err) return; err = br_vlan_get_pvid_rcu(skb->dev, &pvid); if (err) return; if (vid == pvid && skb->vlan_proto == htons(proto)) __vlan_hwaccel_clear_tag(skb); } /** * dsa_software_untag_vlan_unaware_bridge: Software untagging for VLAN-unaware bridge * @skb: Pointer to received socket buffer (packet) * @br: Pointer to bridge upper interface of ingress port * @vid: Parsed VID from packet * * The bridge ignores all VLAN tags. Software like STP/PTP may not (it may run * on the plain port, or on a VLAN upper interface). Maybe packets are coming * to software as tagged with a driver-defined VID which is NOT equal to the * PVID of the bridge port (since the bridge is VLAN-unaware, its configuration * should NOT be committed to hardware). DSA needs a method for this private * VID to be communicated by software to it, and if packets are tagged with it, * software-untag them. Note: the private VID may be different per bridge, to * support the FDB isolation use case. * * FIXME: this is currently implemented based on the broken assumption that * the "private VID" used by the driver in VLAN-unaware mode is equal to the * bridge PVID. It should not be, except for a coincidence; the bridge PVID is * irrelevant to the data path in the VLAN-unaware mode. Thus, the VID that * this function removes is wrong. * * All users of ds->untag_bridge_pvid should fix their drivers, if necessary, * to make the two independent. Only then, if there still remains a need to * strip the private VID from packets, then a new ds->ops->get_private_vid() * API shall be introduced to communicate to DSA what this VID is, which needs * to be stripped here. */ static inline void dsa_software_untag_vlan_unaware_bridge(struct sk_buff *skb, struct net_device *br, u16 vid) { struct net_device *upper_dev; u16 pvid, proto; int err; err = br_vlan_get_proto(br, &proto); if (err) return; err = br_vlan_get_pvid_rcu(skb->dev, &pvid); if (err) return; if (vid != pvid || skb->vlan_proto != htons(proto)) return; /* The sad part about attempting to untag from DSA is that we * don't know, unless we check, if the skb will end up in * the bridge's data path - br_allowed_ingress() - or not. * For example, there might be an 8021q upper for the * default_pvid of the bridge, which will steal VLAN-tagged traffic * from the bridge's data path. This is a configuration that DSA * supports because vlan_filtering is 0. In that case, we should * definitely keep the tag, to make sure it keeps working. */ upper_dev = __vlan_find_dev_deep_rcu(br, htons(proto), vid); if (!upper_dev) __vlan_hwaccel_clear_tag(skb); } /** * dsa_software_vlan_untag: Software VLAN untagging in DSA receive path * @skb: Pointer to socket buffer (packet) * * Receive path method for switches which send some packets as VLAN-tagged * towards the CPU port (generally from VLAN-aware bridge ports) even when the * packet was not tagged on the wire. Called when ds->untag_bridge_pvid * (legacy) or ds->untag_vlan_aware_bridge_pvid is set to true. * * As a side effect of this method, any VLAN tag from the skb head is moved * to hwaccel. */ static inline struct sk_buff *dsa_software_vlan_untag(struct sk_buff *skb) { struct dsa_port *dp = dsa_user_to_port(skb->dev); struct net_device *br = dsa_port_bridge_dev_get(dp); u16 vid, proto; int err; /* software untagging for standalone ports not yet necessary */ if (!br) return skb; err = br_vlan_get_proto(br, &proto); if (err) return skb; /* Move VLAN tag from data to hwaccel */ if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) { skb = skb_vlan_untag(skb); if (!skb) return NULL; } if (!skb_vlan_tag_present(skb)) return skb; vid = skb_vlan_tag_get_id(skb); if (br_vlan_enabled(br)) { if (dp->ds->untag_vlan_aware_bridge_pvid) dsa_software_untag_vlan_aware_bridge(skb, br, vid); } else { if (dp->ds->untag_bridge_pvid) dsa_software_untag_vlan_unaware_bridge(skb, br, vid); } return skb; } /* For switches without hardware support for DSA tagging to be able * to support termination through the bridge. */ static inline struct net_device * dsa_find_designated_bridge_port_by_vid(struct net_device *conduit, u16 vid) { struct dsa_port *cpu_dp = conduit->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct bridge_vlan_info vinfo; struct net_device *user; struct dsa_port *dp; int err; list_for_each_entry(dp, &dst->ports, list) { if (dp->type != DSA_PORT_TYPE_USER) continue; if (!dp->bridge) continue; if (dp->stp_state != BR_STATE_LEARNING && dp->stp_state != BR_STATE_FORWARDING) continue; /* Since the bridge might learn this packet, keep the CPU port * affinity with the port that will be used for the reply on * xmit. */ if (dp->cpu_dp != cpu_dp) continue; user = dp->user; err = br_vlan_get_info_rcu(user, vid, &vinfo); if (err) continue; return user; } return NULL; } /* If the ingress port offloads the bridge, we mark the frame as autonomously * forwarded by hardware, so the software bridge doesn't forward in twice, back * to us, because we already did. However, if we're in fallback mode and we do * software bridging, we are not offloading it, therefore the dp->bridge * pointer is not populated, and flooding needs to be done by software (we are * effectively operating in standalone ports mode). */ static inline void dsa_default_offload_fwd_mark(struct sk_buff *skb) { struct dsa_port *dp = dsa_user_to_port(skb->dev); skb->offload_fwd_mark = !!(dp->bridge); } /* Helper for removing DSA header tags from packets in the RX path. * Must not be called before skb_pull(len). * skb->data * | * v * | | | | | | | | | | | | | | | | | | | * +-----------------------+-----------------------+---------------+-------+ * | Destination MAC | Source MAC | DSA header | EType | * +-----------------------+-----------------------+---------------+-------+ * | | * <----- len -----> <----- len -----> * | * >>>>>>> v * >>>>>>> | | | | | | | | | | | | | | | * >>>>>>> +-----------------------+-----------------------+-------+ * >>>>>>> | Destination MAC | Source MAC | EType | * +-----------------------+-----------------------+-------+ * ^ * | * skb->data */ static inline void dsa_strip_etype_header(struct sk_buff *skb, int len) { memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - len, 2 * ETH_ALEN); } /* Helper for creating space for DSA header tags in TX path packets. * Must not be called before skb_push(len). * * Before: * * <<<<<<< | | | | | | | | | | | | | | | * ^ <<<<<<< +-----------------------+-----------------------+-------+ * | <<<<<<< | Destination MAC | Source MAC | EType | * | +-----------------------+-----------------------+-------+ * <----- len -----> * | * | * skb->data * * After: * * | | | | | | | | | | | | | | | | | | | * +-----------------------+-----------------------+---------------+-------+ * | Destination MAC | Source MAC | DSA header | EType | * +-----------------------+-----------------------+---------------+-------+ * ^ | | * | <----- len -----> * skb->data */ static inline void dsa_alloc_etype_header(struct sk_buff *skb, int len) { memmove(skb->data, skb->data + len, 2 * ETH_ALEN); } /* On RX, eth_type_trans() on the DSA conduit pulls ETH_HLEN bytes starting from * skb_mac_header(skb), which leaves skb->data pointing at the first byte after * what the DSA conduit perceives as the EtherType (the beginning of the L3 * protocol). Since DSA EtherType header taggers treat the EtherType as part of * the DSA tag itself, and the EtherType is 2 bytes in length, the DSA header * is located 2 bytes behind skb->data. Note that EtherType in this context * means the first 2 bytes of the DSA header, not the encapsulated EtherType * that will become visible after the DSA header is stripped. */ static inline void *dsa_etype_header_pos_rx(struct sk_buff *skb) { return skb->data - 2; } /* On TX, skb->data points to the MAC header, which means that EtherType * header taggers start exactly where the EtherType is (the EtherType is * treated as part of the DSA header). */ static inline void *dsa_etype_header_pos_tx(struct sk_buff *skb) { return skb->data + 2 * ETH_ALEN; } /* Create 2 modaliases per tagging protocol, one to auto-load the module * given the ID reported by get_tag_protocol(), and the other by name. */ #define DSA_TAG_DRIVER_ALIAS "dsa_tag:" #define MODULE_ALIAS_DSA_TAG_DRIVER(__proto, __name) \ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __name); \ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS "id-" \ __stringify(__proto##_VALUE)) void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], unsigned int count, struct module *owner); void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], unsigned int count); #define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \ static int __init dsa_tag_driver_module_init(void) \ { \ dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \ THIS_MODULE); \ return 0; \ } \ module_init(dsa_tag_driver_module_init); \ \ static void __exit dsa_tag_driver_module_exit(void) \ { \ dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \ } \ module_exit(dsa_tag_driver_module_exit) /** * module_dsa_tag_drivers() - Helper macro for registering DSA tag * drivers * @__ops_array: Array of tag driver structures * * Helper macro for DSA tag drivers which do not do anything special * in module init/exit. Each module may only use this macro once, and * calling it replaces module_init() and module_exit(). */ #define module_dsa_tag_drivers(__ops_array) \ dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array)) #define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops /* Create a static structure we can build a linked list of dsa_tag * drivers */ #define DSA_TAG_DRIVER(__ops) \ static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \ .ops = &__ops, \ } /** * module_dsa_tag_driver() - Helper macro for registering a single DSA tag * driver * @__ops: Single tag driver structures * * Helper macro for DSA tag drivers which do not do anything special * in module init/exit. Each module may only use this macro once, and * calling it replaces module_init() and module_exit(). */ #define module_dsa_tag_driver(__ops) \ DSA_TAG_DRIVER(__ops); \ \ static struct dsa_tag_driver *dsa_tag_driver_array[] = { \ &DSA_TAG_DRIVER_NAME(__ops) \ }; \ module_dsa_tag_drivers(dsa_tag_driver_array) #endif
165 176 161 20 2 162 176 166 10 10 10 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_H #define _LINUX_HIGHMEM_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/bug.h> #include <linux/cacheflush.h> #include <linux/kmsan.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include "highmem-internal.h" /** * kmap - Map a page for long term usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can only be invoked from preemptible task context because on 32bit * systems with CONFIG_HIGHMEM enabled this function might sleep. * * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area * this returns the virtual address of the direct kernel mapping. * * The returned virtual address is globally visible and valid up to the * point where it is unmapped via kunmap(). The pointer can be handed to * other contexts. * * For highmem pages on 32bit systems this can be slow as the mapping space * is limited and protected by a global lock. In case that there is no * mapping slot available the function blocks until a slot is released via * kunmap(). */ static inline void *kmap(struct page *page); /** * kunmap - Unmap the virtual address mapped by kmap() * @page: Pointer to the page which was mapped by kmap() * * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. */ static inline void kunmap(struct page *page); /** * kmap_to_page - Get the page for a kmap'ed address * @addr: The address to look up * * Returns: The page which is mapped to @addr. */ static inline struct page *kmap_to_page(void *addr); /** * kmap_flush_unused - Flush all unused kmap mappings in order to * remove stray mappings */ static inline void kmap_flush_unused(void); /** * kmap_local_page - Map a page for temporary usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can be invoked from any context, including interrupts. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation: * * addr1 = kmap_local_page(page1); * addr2 = kmap_local_page(page2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While kmap_local_page() is significantly faster than kmap() for the highmem * case it comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_page() can rely on this side effect. */ static inline void *kmap_local_page(struct page *page); /** * kmap_local_folio - Map a page in this folio for temporary usage * @folio: The folio containing the page. * @offset: The byte offset within the folio which identifies the page. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation:: * * addr1 = kmap_local_folio(folio1, offset1); * addr2 = kmap_local_folio(folio2, offset2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While it is significantly faster than kmap() for the highmem case it * comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_folio() can rely on this side effect. * * Context: Can be invoked from any context. * Return: The virtual address of @offset. */ static inline void *kmap_local_folio(struct folio *folio, size_t offset); /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * In fact a wrapper around kmap_local_page() which also disables pagefaults * and, depending on PREEMPT_RT configuration, also CPU migration and * preemption. Therefore users should not count on the latter two side effects. * * Mappings should always be released by kunmap_atomic(). * * Do not use in new code. Use kmap_local_page() instead. * * It is used in atomic context when code wants to access the contents of a * page that might be allocated from high memory (see __GFP_HIGHMEM), for * example a page in the pagecache. The API has two functions, and they * can be used in a manner similar to the following:: * * // Find the page of interest. * struct page *page = find_get_page(mapping, offset); * * // Gain access to the contents of that page. * void *vaddr = kmap_atomic(page); * * // Do something to the contents of that page. * memset(vaddr, 0, PAGE_SIZE); * * // Unmap that page. * kunmap_atomic(vaddr); * * Note that the kunmap_atomic() call takes the result of the kmap_atomic() * call, not the argument. * * If you need to map two pages because you want to copy from one page to * another you need to keep the kmap_atomic calls strictly nested, like: * * vaddr1 = kmap_atomic(page1); * vaddr2 = kmap_atomic(page2); * * memcpy(vaddr1, vaddr2, PAGE_SIZE); * * kunmap_atomic(vaddr2); * kunmap_atomic(vaddr1); */ static inline void *kmap_atomic(struct page *page); /* Highmem related interfaces for management code */ static inline unsigned long nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); #ifndef ARCH_HAS_FLUSH_ANON_PAGE static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { } #endif #ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } static inline void invalidate_kernel_vmap_range(void *vaddr, int size) { } #endif /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); kunmap_local(addr); } #endif #ifndef vma_alloc_zeroed_movable_folio /** * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. * @vma: The VMA the page is to be allocated for. * @vaddr: The virtual address the page will be inserted into. * * This function will allocate a page suitable for inserting into this * VMA at this virtual address. It may be allocated from highmem or * the movable zone. An architecture may provide its own implementation. * * Return: A folio containing one allocated and zeroed page or NULL if * we are out of memory. */ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { struct folio *folio; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); if (folio && user_alloc_needs_zeroing()) clear_user_highpage(&folio->page, vaddr); return folio; } #endif static inline void clear_highpage(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kaddr); kunmap_local(kaddr); } static inline void clear_highpage_kasan_tagged(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kasan_reset_tag(kaddr)); kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE static inline void tag_clear_highpage(struct page *page) { } #endif /* * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. */ #ifdef CONFIG_HIGHMEM void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2); #else static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); if (end1 > start1) memset(kaddr + start1, 0, end1 - start1); if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } #endif static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) { zero_user_segments(page, start, end, 0, 0); } static inline void zero_user(struct page *page, unsigned start, unsigned size) { zero_user_segments(page, start, start + size, 0, 0); } #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifdef copy_mc_to_kernel /* * If architecture supports machine check exception handling, define the * #MC versions of copy_user_highpage and copy_highpage. They copy a memory * page with #MC in source page (@from) handled, and return the number * of bytes not copied if there was a #MC, otherwise 0 for success. */ static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } static inline int copy_mc_highpage(struct page *to, struct page *from) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } #else static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { copy_user_highpage(to, from, vaddr, vma); return 0; } static inline int copy_mc_highpage(struct page *to, struct page *from) { copy_highpage(to, from); return 0; } #endif static inline void memcpy_page(struct page *dst_page, size_t dst_off, struct page *src_page, size_t src_off, size_t len) { char *dst = kmap_local_page(dst_page); char *src = kmap_local_page(src_page); VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE); memcpy(dst + dst_off, src + src_off, len); kunmap_local(src); kunmap_local(dst); } static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, val, len); kunmap_local(addr); } static inline void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) { char *from = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to, from + offset, len); kunmap_local(from); } static inline void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len) { char *to = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to + offset, from, len); flush_dcache_page(page); kunmap_local(to); } static inline void memzero_page(struct page *page, size_t offset, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, 0, len); flush_dcache_page(page); kunmap_local(addr); } /** * memcpy_from_folio - Copy a range of bytes from a folio. * @to: The memory to copy to. * @folio: The folio to read from. * @offset: The first byte in the folio to read. * @len: The number of bytes to copy. */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { const char *from = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_highmem(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(from); to += chunk; offset += chunk; len -= chunk; } while (len > 0); } /** * memcpy_to_folio - Copy a range of bytes to a folio. * @folio: The folio to write to. * @offset: The first byte in the folio to store to. * @from: The memory to copy from. * @len: The number of bytes to copy. */ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { char *to = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_highmem(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(to); from += chunk; offset += chunk; len -= chunk; } while (len > 0); flush_dcache_folio(folio); } /** * folio_zero_tail - Zero the tail of a folio. * @folio: The folio to zero. * @offset: The byte offset in the folio to start zeroing at. * @kaddr: The address the folio is currently mapped to. * * If you have already used kmap_local_folio() to map a folio, written * some data to it and now need to zero the end of the folio (and flush * the dcache), you can use this function. If you do not have the * folio kmapped (eg the folio has been partially populated by DMA), * use folio_zero_range() or folio_zero_segment() instead. * * Return: An address which can be passed to kunmap_local(). */ static inline __must_check void *folio_zero_tail(struct folio *folio, size_t offset, void *kaddr) { size_t len = folio_size(folio) - offset; if (folio_test_highmem(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memset(kaddr, 0, max); kunmap_local(kaddr); len -= max; offset += max; max = PAGE_SIZE; kaddr = kmap_local_folio(folio, offset); } } memset(kaddr, 0, len); flush_dcache_folio(folio); return kaddr; } /** * folio_fill_tail - Copy some data to a folio and pad with zeroes. * @folio: The destination folio. * @offset: The offset into @folio at which to start copying. * @from: The data to copy. * @len: How many bytes of data to copy. * * This function is most useful for filesystems which support inline data. * When they want to copy data from the inode into the page cache, this * function does everything for them. It supports large folios even on * HIGHMEM configurations. */ static inline void folio_fill_tail(struct folio *folio, size_t offset, const char *from, size_t len) { char *to = kmap_local_folio(folio, offset); VM_BUG_ON(offset + len > folio_size(folio)); if (folio_test_highmem(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memcpy(to, from, max); kunmap_local(to); len -= max; from += max; offset += max; max = PAGE_SIZE; to = kmap_local_folio(folio, offset); } } memcpy(to, from, len); to = folio_zero_tail(folio, offset + len, to + len); kunmap_local(to); } /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. * @folio: The folio to copy from. * @pos: The position in the file. * @len: The maximum number of bytes to copy. * * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE * if the folio comes from HIGHMEM, and by the size of the folio. * * Return: The number of bytes copied from the folio. */ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, loff_t pos, size_t len) { size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); if (folio_test_highmem(folio)) { offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); } else len = min(len, folio_size(folio) - offset); memcpy(to, from, len); kunmap_local(from); return len; } /** * folio_zero_segments() - Zero two byte ranges in a folio. * @folio: The folio to write to. * @start1: The first byte to zero. * @xend1: One more than the last byte in the first range. * @start2: The first byte to zero in the second range. * @xend2: One more than the last byte in the second range. */ static inline void folio_zero_segments(struct folio *folio, size_t start1, size_t xend1, size_t start2, size_t xend2) { zero_user_segments(&folio->page, start1, xend1, start2, xend2); } /** * folio_zero_segment() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @xend: One more than the last byte to zero. */ static inline void folio_zero_segment(struct folio *folio, size_t start, size_t xend) { zero_user_segments(&folio->page, start, xend, 0, 0); } /** * folio_zero_range() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @length: The number of bytes to zero. */ static inline void folio_zero_range(struct folio *folio, size_t start, size_t length) { zero_user_segments(&folio->page, start, start + length, 0, 0); } /** * folio_release_kmap - Unmap a folio and drop a refcount. * @folio: The folio to release. * @addr: The address previously returned by a call to kmap_local_folio(). * * It is common, eg in directory handling to kmap a folio. This function * unmaps the folio and drops the refcount that was being held to keep the * folio alive while we accessed it. */ static inline void folio_release_kmap(struct folio *folio, void *addr) { kunmap_local(addr); folio_put(folio); } static inline void unmap_and_put_page(struct page *page, void *addr) { folio_release_kmap(page_folio(page), addr); } #endif /* _LINUX_HIGHMEM_H */
3749 3748 3746 3745 3748 3741 3744 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 // SPDX-License-Identifier: GPL-2.0 /* * SHA1 routine optimized to do word accesses rather than byte accesses, * and to avoid unnecessary copies into the context array. * * This was based on the git SHA1 implementation. */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/string.h> #include <crypto/sha1.h> #include <linux/unaligned.h> /* * If you have 32 registers or more, the compiler can (and should) * try to change the array[] accesses into registers. However, on * machines with less than ~25 registers, that won't really work, * and at least gcc will make an unholy mess of it. * * So to avoid that mess which just slows things down, we force * the stores to memory to actually happen (we might be better off * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as * suggested by Artur Skawina - that will also make gcc unable to * try to do the silly "optimize away loads" part because it won't * see what the value will be). * * Ben Herrenschmidt reports that on PPC, the C version comes close * to the optimized asm with this (ie on PPC you don't want that * 'volatile', since there are lots of registers). * * On ARM we get the best code generation by forcing a full memory barrier * between each SHA_ROUND, otherwise gcc happily get wild with spilling and * the stack frame size simply explode and performance goes down the drain. */ #ifdef CONFIG_X86 #define setW(x, val) (*(volatile __u32 *)&W(x) = (val)) #elif defined(CONFIG_ARM) #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0) #else #define setW(x, val) (W(x) = (val)) #endif /* This "rolls" over the 512-bit array */ #define W(x) (array[(x)&15]) /* * Where do we get the source from? The first 16 iterations get it from * the input data, the next mix it from the 512-bit array. */ #define SHA_SRC(t) get_unaligned_be32((__u32 *)data + t) #define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) #define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \ __u32 TEMP = input(t); setW(t, TEMP); \ E += TEMP + rol32(A,5) + (fn) + (constant); \ B = ror32(B, 2); \ TEMP = E; E = D; D = C; C = B; B = A; A = TEMP; } while (0) #define T_0_15(t, A, B, C, D, E) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) #define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) #define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E ) #define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E ) #define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E ) /** * sha1_transform - single block SHA1 transform (deprecated) * * @digest: 160 bit digest to update * @data: 512 bits of data to hash * @array: 16 words of workspace (see note) * * This function executes SHA-1's internal compression function. It updates the * 160-bit internal state (@digest) with a single 512-bit data block (@data). * * Don't use this function. SHA-1 is no longer considered secure. And even if * you do have to use SHA-1, this isn't the correct way to hash something with * SHA-1 as this doesn't handle padding and finalization. * * Note: If the hash is security sensitive, the caller should be sure * to clear the workspace. This is left to the caller to avoid * unnecessary clears between chained hashing operations. */ void sha1_transform(__u32 *digest, const char *data, __u32 *array) { __u32 A, B, C, D, E; unsigned int i = 0; A = digest[0]; B = digest[1]; C = digest[2]; D = digest[3]; E = digest[4]; /* Round 1 - iterations 0-16 take their input from 'data' */ for (; i < 16; ++i) T_0_15(i, A, B, C, D, E); /* Round 1 - tail. Input from 512-bit mixing array */ for (; i < 20; ++i) T_16_19(i, A, B, C, D, E); /* Round 2 */ for (; i < 40; ++i) T_20_39(i, A, B, C, D, E); /* Round 3 */ for (; i < 60; ++i) T_40_59(i, A, B, C, D, E); /* Round 4 */ for (; i < 80; ++i) T_60_79(i, A, B, C, D, E); digest[0] += A; digest[1] += B; digest[2] += C; digest[3] += D; digest[4] += E; } EXPORT_SYMBOL(sha1_transform); /** * sha1_init - initialize the vectors for a SHA1 digest * @buf: vector to initialize */ void sha1_init(__u32 *buf) { buf[0] = 0x67452301; buf[1] = 0xefcdab89; buf[2] = 0x98badcfe; buf[3] = 0x10325476; buf[4] = 0xc3d2e1f0; } EXPORT_SYMBOL(sha1_init); MODULE_DESCRIPTION("SHA-1 Algorithm"); MODULE_LICENSE("GPL");
491 523 43 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 /* * net/tipc/core.h: Include file for TIPC global declarations * * Copyright (c) 2005-2006, 2013-2018 Ericsson AB * Copyright (c) 2005-2007, 2010-2013, Wind River Systems * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_CORE_H #define _TIPC_CORE_H #include <linux/tipc.h> #include <linux/tipc_config.h> #include <linux/tipc_netlink.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/interrupt.h> #include <linux/atomic.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/rtnetlink.h> #include <linux/etherdevice.h> #include <net/netns/generic.h> #include <linux/rhashtable.h> #include <net/genetlink.h> #include <net/netns/hash.h> #ifdef pr_fmt #undef pr_fmt #endif #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt struct tipc_node; struct tipc_bearer; struct tipc_bc_base; struct tipc_link; struct tipc_topsrv; struct tipc_monitor; #ifdef CONFIG_TIPC_CRYPTO struct tipc_crypto; #endif #define TIPC_MOD_VER "2.0.0" #define NODE_HTABLE_SIZE 512 #define MAX_BEARERS 3 #define TIPC_DEF_MON_THRESHOLD 32 #define NODE_ID_LEN 16 #define NODE_ID_STR_LEN (NODE_ID_LEN * 2 + 1) extern unsigned int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; extern int sysctl_tipc_named_timeout __read_mostly; struct tipc_net { u8 node_id[NODE_ID_LEN]; u32 node_addr; u32 trial_addr; unsigned long addr_trial_end; char node_id_string[NODE_ID_STR_LEN]; int net_id; int random; bool legacy_addr_format; /* Node table and node list */ spinlock_t node_list_lock; struct hlist_head node_htable[NODE_HTABLE_SIZE]; struct list_head node_list; u32 num_nodes; u32 num_links; /* Neighbor monitoring list */ struct tipc_monitor *monitors[MAX_BEARERS]; int mon_threshold; /* Bearer list */ struct tipc_bearer __rcu *bearer_list[MAX_BEARERS + 1]; /* Broadcast link */ spinlock_t bclock; struct tipc_bc_base *bcbase; struct tipc_link *bcl; /* Socket hash table */ struct rhashtable sk_rht; /* Name table */ spinlock_t nametbl_lock; struct name_table *nametbl; /* Topology subscription server */ struct tipc_topsrv *topsrv; atomic_t subscription_count; /* Cluster capabilities */ u16 capabilities; /* Tracing of node internal messages */ struct packet_type loopback_pt; #ifdef CONFIG_TIPC_CRYPTO /* TX crypto handler */ struct tipc_crypto *crypto_tx; #endif /* Work item for net finalize */ struct work_struct work; /* The numbers of work queues in schedule */ atomic_t wq_count; }; static inline struct tipc_net *tipc_net(struct net *net) { return net_generic(net, tipc_net_id); } static inline int tipc_netid(struct net *net) { return tipc_net(net)->net_id; } static inline struct list_head *tipc_nodes(struct net *net) { return &tipc_net(net)->node_list; } static inline struct name_table *tipc_name_table(struct net *net) { return tipc_net(net)->nametbl; } static inline struct tipc_topsrv *tipc_topsrv(struct net *net) { return tipc_net(net)->topsrv; } static inline unsigned int tipc_hashfn(u32 addr) { return addr & (NODE_HTABLE_SIZE - 1); } static inline u16 mod(u16 x) { return x & 0xffffu; } static inline int less_eq(u16 left, u16 right) { return mod(right - left) < 32768u; } static inline int more(u16 left, u16 right) { return !less_eq(left, right); } static inline int less(u16 left, u16 right) { return less_eq(left, right) && (mod(right) != mod(left)); } static inline int tipc_in_range(u16 val, u16 min, u16 max) { return !less(val, min) && !more(val, max); } static inline u32 tipc_net_hash_mixes(struct net *net, int tn_rand) { return net_hash_mix(&init_net) ^ net_hash_mix(net) ^ tn_rand; } static inline u32 hash128to32(char *bytes) { __be32 *tmp = (__be32 *)bytes; u32 res; res = ntohl(tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]); if (likely(res)) return res; return ntohl(tmp[0] | tmp[1] | tmp[2] | tmp[3]); } #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); void tipc_unregister_sysctl(void); #else #define tipc_register_sysctl() 0 #define tipc_unregister_sysctl() #endif #endif
3 3 114 4 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * SHA1 Secure Hash Algorithm. * * Derived from cryptoapi implementation, adapted for in-place * scatterlist interface. * * Copyright (c) Alan Smithee. * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> */ #include <crypto/internal/hash.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <crypto/sha1.h> #include <crypto/sha1_base.h> #include <asm/byteorder.h> const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE] = { 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d, 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90, 0xaf, 0xd8, 0x07, 0x09 }; EXPORT_SYMBOL_GPL(sha1_zero_message_hash); static void sha1_generic_block_fn(struct sha1_state *sst, u8 const *src, int blocks) { u32 temp[SHA1_WORKSPACE_WORDS]; while (blocks--) { sha1_transform(sst->state, src, temp); src += SHA1_BLOCK_SIZE; } memzero_explicit(temp, sizeof(temp)); } int crypto_sha1_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha1_base_do_update(desc, data, len, sha1_generic_block_fn); } EXPORT_SYMBOL(crypto_sha1_update); static int sha1_final(struct shash_desc *desc, u8 *out) { sha1_base_do_finalize(desc, sha1_generic_block_fn); return sha1_base_finish(desc, out); } int crypto_sha1_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { sha1_base_do_update(desc, data, len, sha1_generic_block_fn); return sha1_final(desc, out); } EXPORT_SYMBOL(crypto_sha1_finup); static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = crypto_sha1_update, .final = sha1_final, .finup = crypto_sha1_finup, .descsize = sizeof(struct sha1_state), .base = { .cra_name = "sha1", .cra_driver_name= "sha1-generic", .cra_priority = 100, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } }; static int __init sha1_generic_mod_init(void) { return crypto_register_shash(&alg); } static void __exit sha1_generic_mod_fini(void) { crypto_unregister_shash(&alg); } subsys_initcall(sha1_generic_mod_init); module_exit(sha1_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm"); MODULE_ALIAS_CRYPTO("sha1"); MODULE_ALIAS_CRYPTO("sha1-generic");
490 491 491 491 1560 1562 1345 1345 1345 1346 1344 1345 1564 1566 1348 1565 490 490 492 492 1559 1562 1346 1343 1344 1299 143 1562 1564 1561 1347 1564 492 480 18 458 458 1571 1569 445 447 1596 1336 458 458 458 1337 66 1274 1329 4 4 114 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 // SPDX-License-Identifier: GPL-2.0-or-later /* * net-sysfs.c - network device class and attributes * * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/sched/isolation.h> #include <linux/nsproxy.h> #include <net/sock.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/vmalloc.h> #include <linux/export.h> #include <linux/jiffies.h> #include <linux/pm_runtime.h> #include <linux/of.h> #include <linux/of_net.h> #include <linux/cpu.h> #include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/rps.h> #include "dev.h" #include "net-sysfs.h" #ifdef CONFIG_SYSFS static const char fmt_hex[] = "%#x\n"; static const char fmt_dec[] = "%d\n"; static const char fmt_uint[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; /* Caller holds RTNL, netdev->lock or RCU */ static inline int dev_isalive(const struct net_device *dev) { return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; } /* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active, * when unregistering a net device and accessing associated sysfs files. The * potential deadlock is as follow: * * CPU 0 CPU 1 * * rtnl_lock vfs_read * unregister_netdevice_many kernfs_seq_start * device_del / kobject_put kernfs_get_active (kn->active++) * kernfs_drain sysfs_kf_seq_show * wait_event( rtnl_lock * kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release * -> waits on CPU 1 to decrease kn->active the rtnl lock. * * The historical fix was to use rtnl_trylock with restart_syscall to bail out * of sysfs operations when the lock couldn't be taken. This fixed the above * issue as it allowed CPU 1 to bail out of the ABBA situation. * * But it came with performances issues, as syscalls are being restarted in * loops when there was contention on the rtnl lock, with huge slow downs in * specific scenarios (e.g. lots of virtual interfaces created and userspace * daemons querying their attributes). * * The idea below is to bail out of the active kernfs_node protection * (kn->active) while trying to take the rtnl lock. * * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The * net device is guaranteed to be alive if this returns successfully. */ static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr, struct net_device *ndev) { struct kernfs_node *kn; int ret = 0; /* First, we hold a reference to the net device as the unregistration * path might run in parallel. This will ensure the net device and the * associated sysfs objects won't be freed while we try to take the rtnl * lock. */ dev_hold(ndev); /* sysfs_break_active_protection was introduced to allow self-removal of * devices and their associated sysfs files by bailing out of the * sysfs/kernfs protection. We do this here to allow the unregistration * path to complete in parallel. The following takes a reference on the * kobject and the kernfs_node being accessed. * * This works because we hold a reference onto the net device and the * unregistration path will wait for us eventually in netdev_run_todo * (outside an rtnl lock section). */ kn = sysfs_break_active_protection(kobj, attr); /* We can now try to take the rtnl lock. This can't deadlock us as the * unregistration path is able to drain sysfs files (kernfs_node) thanks * to the above dance. */ if (rtnl_lock_interruptible()) { ret = -ERESTARTSYS; goto unbreak; } /* Check dismantle on the device hasn't started, otherwise deny the * operation. */ if (!dev_isalive(ndev)) { rtnl_unlock(); ret = -ENODEV; goto unbreak; } /* We are now sure the device dismantle hasn't started nor that it can * start before we exit the locking section as we hold the rtnl lock. * There's no need to keep unbreaking the sysfs protection nor to hold * a net device reference from that point; that was only needed to take * the rtnl lock. */ unbreak: sysfs_unbreak_active_protection(kn); dev_put(ndev); return ret; } /* use same locking rules as GIF* ioctl's */ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, ssize_t (*format)(const struct net_device *, char *)) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; rcu_read_lock(); if (dev_isalive(ndev)) ret = (*format)(ndev, buf); rcu_read_unlock(); return ret; } /* generate a show function for simple field */ #define NETDEVICE_SHOW(field, format_string) \ static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ return sysfs_emit(buf, format_string, READ_ONCE(dev->field)); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ return netdev_show(dev, attr, buf, format_##field); \ } \ #define NETDEVICE_SHOW_RO(field, format_string) \ NETDEVICE_SHOW(field, format_string); \ static DEVICE_ATTR_RO(field) #define NETDEVICE_SHOW_RW(field, format_string) \ NETDEVICE_SHOW(field, format_string); \ static DEVICE_ATTR_RW(field) /* use same locking and permission rules as SIF* ioctl's */ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len, int (*set)(struct net_device *, unsigned long)) { struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); unsigned long new; int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = kstrtoul(buf, 0, &new); if (ret) goto err; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) goto err; ret = (*set)(netdev, new); if (ret == 0) ret = len; rtnl_unlock(); err: return ret; } /* Same as netdev_store() but takes netdev_lock() instead of rtnl_lock() */ static ssize_t netdev_lock_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len, int (*set)(struct net_device *, unsigned long)) { struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); unsigned long new; int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = kstrtoul(buf, 0, &new); if (ret) return ret; netdev_lock(netdev); if (dev_isalive(netdev)) { ret = (*set)(netdev, new); if (ret == 0) ret = len; } netdev_unlock(netdev); return ret; } NETDEVICE_SHOW_RO(dev_id, fmt_hex); NETDEVICE_SHOW_RO(dev_port, fmt_dec); NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); NETDEVICE_SHOW_RO(addr_len, fmt_dec); NETDEVICE_SHOW_RO(ifindex, fmt_dec); NETDEVICE_SHOW_RO(type, fmt_dec); NETDEVICE_SHOW_RO(link_mode, fmt_dec); static ssize_t iflink_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); return sysfs_emit(buf, fmt_dec, dev_get_iflink(ndev)); } static DEVICE_ATTR_RO(iflink); static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { return sysfs_emit(buf, fmt_dec, READ_ONCE(dev->name_assign_type)); } static ssize_t name_assign_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; if (READ_ONCE(ndev->name_assign_type) != NET_NAME_UNKNOWN) ret = netdev_show(dev, attr, buf, format_name_assign_type); return ret; } static DEVICE_ATTR_RO(name_assign_type); /* use same locking rules as GIFHWADDR ioctl's (dev_get_mac_address()) */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; down_read(&dev_addr_sem); rcu_read_lock(); if (dev_isalive(ndev)) ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); rcu_read_unlock(); up_read(&dev_addr_sem); return ret; } static DEVICE_ATTR_RO(address); static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); int ret = -EINVAL; rcu_read_lock(); if (dev_isalive(ndev)) ret = sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); rcu_read_unlock(); return ret; } static DEVICE_ATTR_RO(broadcast); static int change_carrier(struct net_device *dev, unsigned long new_carrier) { if (!netif_running(dev)) return -EINVAL; return dev_change_carrier(dev, (bool)new_carrier); } static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct net_device *netdev = to_net_dev(dev); /* The check is also done in change_carrier; this helps returning early * without hitting the locking section in netdev_store. */ if (!netdev->netdev_ops->ndo_change_carrier) return -EOPNOTSUPP; return netdev_store(dev, attr, buf, len, change_carrier); } static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); int ret; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = -EINVAL; if (netif_running(netdev)) { /* Synchronize carrier state with link watch, * see also rtnl_getlink(). */ linkwatch_sync_dev(netdev); ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); } rtnl_unlock(); return ret; } static DEVICE_ATTR_RW(carrier); static ssize_t speed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps * returning early without hitting the locking section below. */ if (!netdev->ethtool_ops->get_link_ksettings) return ret; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = -EINVAL; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) ret = sysfs_emit(buf, fmt_dec, cmd.base.speed); } rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(speed); static ssize_t duplex_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps * returning early without hitting the locking section below. */ if (!netdev->ethtool_ops->get_link_ksettings) return ret; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = -EINVAL; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) { const char *duplex; switch (cmd.base.duplex) { case DUPLEX_HALF: duplex = "half"; break; case DUPLEX_FULL: duplex = "full"; break; default: duplex = "unknown"; break; } ret = sysfs_emit(buf, "%s\n", duplex); } } rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(duplex); static ssize_t testing_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) return sysfs_emit(buf, fmt_dec, !!netif_testing(netdev)); return -EINVAL; } static DEVICE_ATTR_RO(testing); static ssize_t dormant_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) return sysfs_emit(buf, fmt_dec, !!netif_dormant(netdev)); return -EINVAL; } static DEVICE_ATTR_RO(dormant); static const char *const operstates[] = { "unknown", "notpresent", /* currently unused */ "down", "lowerlayerdown", "testing", "dormant", "up" }; static ssize_t operstate_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); unsigned char operstate; operstate = READ_ONCE(netdev->operstate); if (!netif_running(netdev)) operstate = IF_OPER_DOWN; if (operstate >= ARRAY_SIZE(operstates)) return -EINVAL; /* should not happen */ return sysfs_emit(buf, "%s\n", operstates[operstate]); } static DEVICE_ATTR_RO(operstate); static ssize_t carrier_changes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count) + atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_changes); static ssize_t carrier_up_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); } static DEVICE_ATTR_RO(carrier_up_count); static ssize_t carrier_down_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_down_count); /* read-write attributes */ static int change_mtu(struct net_device *dev, unsigned long new_mtu) { return dev_set_mtu(dev, (int)new_mtu); } static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_mtu); } NETDEVICE_SHOW_RW(mtu, fmt_dec); static int change_flags(struct net_device *dev, unsigned long new_flags) { return dev_change_flags(dev, (unsigned int)new_flags, NULL); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_flags); } NETDEVICE_SHOW_RW(flags, fmt_hex); static ssize_t tx_queue_len_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { if (!capable(CAP_NET_ADMIN)) return -EPERM; return netdev_store(dev, attr, buf, len, dev_change_tx_queue_len); } NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { netdev_set_gro_flush_timeout(dev, val); return 0; } static ssize_t gro_flush_timeout_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { if (!capable(CAP_NET_ADMIN)) return -EPERM; return netdev_lock_store(dev, attr, buf, len, change_gro_flush_timeout); } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) { if (val > S32_MAX) return -ERANGE; netdev_set_defer_hard_irqs(dev, (u32)val); return 0; } static ssize_t napi_defer_hard_irqs_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { if (!capable(CAP_NET_ADMIN)) return -EPERM; return netdev_lock_store(dev, attr, buf, len, change_napi_defer_hard_irqs); } NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint); static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); size_t count = len; ssize_t ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; /* ignore trailing newline */ if (len > 0 && buf[len - 1] == '\n') --count; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = dev_set_alias(netdev, buf, count); if (ret < 0) goto err; ret = len; netdev_state_change(netdev); err: rtnl_unlock(); return ret; } static ssize_t ifalias_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); char tmp[IFALIASZ]; ssize_t ret; ret = dev_get_alias(netdev, tmp, sizeof(tmp)); if (ret > 0) ret = sysfs_emit(buf, "%s\n", tmp); return ret; } static DEVICE_ATTR_RW(ifalias); static int change_group(struct net_device *dev, unsigned long new_group) { dev_set_group(dev, (int)new_group); return 0; } static ssize_t group_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_group); } NETDEVICE_SHOW(group, fmt_dec); static DEVICE_ATTR(netdev_group, 0644, group_show, group_store); static int change_proto_down(struct net_device *dev, unsigned long proto_down) { return dev_change_proto_down(dev, (bool)proto_down); } static ssize_t proto_down_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_proto_down); } NETDEVICE_SHOW_RW(proto_down, fmt_dec); static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); struct netdev_phys_item_id ppid; ssize_t ret; /* The check is also done in dev_get_phys_port_id; this helps returning * early without hitting the locking section below. */ if (!netdev->netdev_ops->ndo_get_phys_port_id) return -EOPNOTSUPP; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = dev_get_phys_port_id(netdev, &ppid); if (!ret) ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(phys_port_id); static ssize_t phys_port_name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); char name[IFNAMSIZ]; ssize_t ret; /* The checks are also done in dev_get_phys_port_name; this helps * returning early without hitting the locking section below. */ if (!netdev->netdev_ops->ndo_get_phys_port_name && !netdev->devlink_port) return -EOPNOTSUPP; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = dev_get_phys_port_name(netdev, name, sizeof(name)); if (!ret) ret = sysfs_emit(buf, "%s\n", name); rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(phys_port_name); static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); struct netdev_phys_item_id ppid = { }; ssize_t ret; /* The checks are also done in dev_get_phys_port_name; this helps * returning early without hitting the locking section below. This works * because recurse is false when calling dev_get_port_parent_id. */ if (!netdev->netdev_ops->ndo_get_port_parent_id && !netdev->devlink_port) return -EOPNOTSUPP; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = dev_get_port_parent_id(netdev, &ppid, false); if (!ret) ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(phys_switch_id); static ssize_t threaded_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; rcu_read_lock(); if (dev_isalive(netdev)) ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded)); rcu_read_unlock(); return ret; } static int modify_napi_threaded(struct net_device *dev, unsigned long val) { int ret; if (list_empty(&dev->napi_list)) return -EOPNOTSUPP; if (val != 0 && val != 1) return -EOPNOTSUPP; ret = dev_set_threaded(dev, val); return ret; } static ssize_t threaded_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_lock_store(dev, attr, buf, len, modify_napi_threaded); } static DEVICE_ATTR_RW(threaded); static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_netdev_group.attr, &dev_attr_type.attr, &dev_attr_dev_id.attr, &dev_attr_dev_port.attr, &dev_attr_iflink.attr, &dev_attr_ifindex.attr, &dev_attr_name_assign_type.attr, &dev_attr_addr_assign_type.attr, &dev_attr_addr_len.attr, &dev_attr_link_mode.attr, &dev_attr_address.attr, &dev_attr_broadcast.attr, &dev_attr_speed.attr, &dev_attr_duplex.attr, &dev_attr_dormant.attr, &dev_attr_testing.attr, &dev_attr_operstate.attr, &dev_attr_carrier_changes.attr, &dev_attr_ifalias.attr, &dev_attr_carrier.attr, &dev_attr_mtu.attr, &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, &dev_attr_napi_defer_hard_irqs.attr, &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, &dev_attr_proto_down.attr, &dev_attr_carrier_up_count.attr, &dev_attr_carrier_down_count.attr, &dev_attr_threaded.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); /* Show a given an attribute in the statistics group */ static ssize_t netstat_show(const struct device *d, struct device_attribute *attr, char *buf, unsigned long offset) { struct net_device *dev = to_net_dev(d); ssize_t ret = -EINVAL; WARN_ON(offset > sizeof(struct rtnl_link_stats64) || offset % sizeof(u64) != 0); rcu_read_lock(); if (dev_isalive(dev)) { struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } rcu_read_unlock(); return ret; } /* generate a read-only statistics attribute */ #define NETSTAT_ENTRY(name) \ static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *buf) \ { \ return netstat_show(d, attr, buf, \ offsetof(struct rtnl_link_stats64, name)); \ } \ static DEVICE_ATTR_RO(name) NETSTAT_ENTRY(rx_packets); NETSTAT_ENTRY(tx_packets); NETSTAT_ENTRY(rx_bytes); NETSTAT_ENTRY(tx_bytes); NETSTAT_ENTRY(rx_errors); NETSTAT_ENTRY(tx_errors); NETSTAT_ENTRY(rx_dropped); NETSTAT_ENTRY(tx_dropped); NETSTAT_ENTRY(multicast); NETSTAT_ENTRY(collisions); NETSTAT_ENTRY(rx_length_errors); NETSTAT_ENTRY(rx_over_errors); NETSTAT_ENTRY(rx_crc_errors); NETSTAT_ENTRY(rx_frame_errors); NETSTAT_ENTRY(rx_fifo_errors); NETSTAT_ENTRY(rx_missed_errors); NETSTAT_ENTRY(tx_aborted_errors); NETSTAT_ENTRY(tx_carrier_errors); NETSTAT_ENTRY(tx_fifo_errors); NETSTAT_ENTRY(tx_heartbeat_errors); NETSTAT_ENTRY(tx_window_errors); NETSTAT_ENTRY(rx_compressed); NETSTAT_ENTRY(tx_compressed); NETSTAT_ENTRY(rx_nohandler); static struct attribute *netstat_attrs[] __ro_after_init = { &dev_attr_rx_packets.attr, &dev_attr_tx_packets.attr, &dev_attr_rx_bytes.attr, &dev_attr_tx_bytes.attr, &dev_attr_rx_errors.attr, &dev_attr_tx_errors.attr, &dev_attr_rx_dropped.attr, &dev_attr_tx_dropped.attr, &dev_attr_multicast.attr, &dev_attr_collisions.attr, &dev_attr_rx_length_errors.attr, &dev_attr_rx_over_errors.attr, &dev_attr_rx_crc_errors.attr, &dev_attr_rx_frame_errors.attr, &dev_attr_rx_fifo_errors.attr, &dev_attr_rx_missed_errors.attr, &dev_attr_tx_aborted_errors.attr, &dev_attr_tx_carrier_errors.attr, &dev_attr_tx_fifo_errors.attr, &dev_attr_tx_heartbeat_errors.attr, &dev_attr_tx_window_errors.attr, &dev_attr_rx_compressed.attr, &dev_attr_tx_compressed.attr, &dev_attr_rx_nohandler.attr, NULL }; static const struct attribute_group netstat_group = { .name = "statistics", .attrs = netstat_attrs, }; static struct attribute *wireless_attrs[] = { NULL }; static const struct attribute_group wireless_group = { .name = "wireless", .attrs = wireless_attrs, }; static bool wireless_group_needed(struct net_device *ndev) { #if IS_ENABLED(CONFIG_CFG80211) if (ndev->ieee80211_ptr) return true; #endif #if IS_ENABLED(CONFIG_WIRELESS_EXT) if (ndev->wireless_handlers) return true; #endif return false; } #else /* CONFIG_SYSFS */ #define net_class_groups NULL #endif /* CONFIG_SYSFS */ #ifdef CONFIG_SYSFS #define to_rx_queue_attr(_attr) \ container_of(_attr, struct rx_queue_attribute, attr) #define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj) static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); struct netdev_rx_queue *queue = to_rx_queue(kobj); if (!attribute->show) return -EIO; return attribute->show(queue, buf); } static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); struct netdev_rx_queue *queue = to_rx_queue(kobj); if (!attribute->store) return -EIO; return attribute->store(queue, buf, count); } static const struct sysfs_ops rx_queue_sysfs_ops = { .show = rx_queue_attr_show, .store = rx_queue_attr_store, }; #ifdef CONFIG_RPS static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) { struct rps_map *map; cpumask_var_t mask; int i, len; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; rcu_read_lock(); map = rcu_dereference(queue->rps_map); if (map) for (i = 0; i < map->len; i++) cpumask_set_cpu(map->cpus[i], mask); len = sysfs_emit(buf, "%*pb\n", cpumask_pr_args(mask)); rcu_read_unlock(); free_cpumask_var(mask); return len < PAGE_SIZE ? len : -EINVAL; } static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue, cpumask_var_t mask) { static DEFINE_MUTEX(rps_map_mutex); struct rps_map *old_map, *map; int cpu, i; map = kzalloc(max_t(unsigned int, RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), GFP_KERNEL); if (!map) return -ENOMEM; i = 0; for_each_cpu_and(cpu, mask, cpu_online_mask) map->cpus[i++] = cpu; if (i) { map->len = i; } else { kfree(map); map = NULL; } mutex_lock(&rps_map_mutex); old_map = rcu_dereference_protected(queue->rps_map, mutex_is_locked(&rps_map_mutex)); rcu_assign_pointer(queue->rps_map, map); if (map) static_branch_inc(&rps_needed); if (old_map) static_branch_dec(&rps_needed); mutex_unlock(&rps_map_mutex); if (old_map) kfree_rcu(old_map, rcu); return 0; } int rps_cpumask_housekeeping(struct cpumask *mask) { if (!cpumask_empty(mask)) { cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); if (cpumask_empty(mask)) return -EINVAL; } return 0; } static ssize_t store_rps_map(struct netdev_rx_queue *queue, const char *buf, size_t len) { cpumask_var_t mask; int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); if (err) goto out; err = rps_cpumask_housekeeping(mask); if (err) goto out; err = netdev_rx_queue_set_rps_mask(queue, mask); out: free_cpumask_var(mask); return err ? : len; } static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, char *buf) { struct rps_dev_flow_table *flow_table; unsigned long val = 0; rcu_read_lock(); flow_table = rcu_dereference(queue->rps_flow_table); if (flow_table) val = 1UL << flow_table->log; rcu_read_unlock(); return sysfs_emit(buf, "%lu\n", val); } static void rps_dev_flow_table_release(struct rcu_head *rcu) { struct rps_dev_flow_table *table = container_of(rcu, struct rps_dev_flow_table, rcu); vfree(table); } static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, const char *buf, size_t len) { unsigned long mask, count; struct rps_dev_flow_table *table, *old_table; static DEFINE_SPINLOCK(rps_dev_flow_lock); int rc; if (!capable(CAP_NET_ADMIN)) return -EPERM; rc = kstrtoul(buf, 0, &count); if (rc < 0) return rc; if (count) { mask = count - 1; /* mask = roundup_pow_of_two(count) - 1; * without overflows... */ while ((mask | (mask >> 1)) != mask) mask |= (mask >> 1); /* On 64 bit arches, must check mask fits in table->mask (u32), * and on 32bit arches, must check * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow. */ #if BITS_PER_LONG > 32 if (mask > (unsigned long)(u32)mask) return -EINVAL; #else if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1)) / sizeof(struct rps_dev_flow)) { /* Enforce a limit to prevent overflow */ return -EINVAL; } #endif table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1)); if (!table) return -ENOMEM; table->log = ilog2(mask) + 1; for (count = 0; count <= mask; count++) table->flows[count].cpu = RPS_NO_CPU; } else { table = NULL; } spin_lock(&rps_dev_flow_lock); old_table = rcu_dereference_protected(queue->rps_flow_table, lockdep_is_held(&rps_dev_flow_lock)); rcu_assign_pointer(queue->rps_flow_table, table); spin_unlock(&rps_dev_flow_lock); if (old_table) call_rcu(&old_table->rcu, rps_dev_flow_table_release); return len; } static struct rx_queue_attribute rps_cpus_attribute __ro_after_init = __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map); static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init = __ATTR(rps_flow_cnt, 0644, show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); #endif /* CONFIG_RPS */ static struct attribute *rx_queue_default_attrs[] __ro_after_init = { #ifdef CONFIG_RPS &rps_cpus_attribute.attr, &rps_dev_flow_table_cnt_attribute.attr, #endif NULL }; ATTRIBUTE_GROUPS(rx_queue_default); static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); #ifdef CONFIG_RPS struct rps_map *map; struct rps_dev_flow_table *flow_table; map = rcu_dereference_protected(queue->rps_map, 1); if (map) { RCU_INIT_POINTER(queue->rps_map, NULL); kfree_rcu(map, rcu); } flow_table = rcu_dereference_protected(queue->rps_flow_table, 1); if (flow_table) { RCU_INIT_POINTER(queue->rps_flow_table, NULL); call_rcu(&flow_table->rcu, rps_dev_flow_table_release); } #endif memset(kobj, 0, sizeof(*kobj)); netdev_put(queue->dev, &queue->dev_tracker); } static const void *rx_queue_namespace(const struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); struct device *dev = &queue->dev->dev; const void *ns = NULL; if (dev->class && dev->class->namespace) ns = dev->class->namespace(dev); return ns; } static void rx_queue_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { const struct net *net = rx_queue_namespace(kobj); net_ns_get_ownership(net, uid, gid); } static const struct kobj_type rx_queue_ktype = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, .namespace = rx_queue_namespace, .get_ownership = rx_queue_get_ownership, }; static int rx_queue_default_mask(struct net_device *dev, struct netdev_rx_queue *queue) { #if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL) struct cpumask *rps_default_mask = READ_ONCE(dev_net(dev)->core.rps_default_mask); if (rps_default_mask && !cpumask_empty(rps_default_mask)) return netdev_rx_queue_set_rps_mask(queue, rps_default_mask); #endif return 0; } static int rx_queue_add_kobject(struct net_device *dev, int index) { struct netdev_rx_queue *queue = dev->_rx + index; struct kobject *kobj = &queue->kobj; int error = 0; /* Rx queues are cleared in rx_queue_release to allow later * re-registration. This is triggered when their kobj refcount is * dropped. * * If a queue is removed while both a read (or write) operation and a * the re-addition of the same queue are pending (waiting on rntl_lock) * it might happen that the re-addition will execute before the read, * making the initial removal to never happen (queue's kobj refcount * won't drop enough because of the pending read). In such rare case, * return to allow the removal operation to complete. */ if (unlikely(kobj->state_initialized)) { netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed"); return -EAGAIN; } /* Kobject_put later will trigger rx_queue_release call which * decreases dev refcount: Take that reference here */ netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL); kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, "rx-%u", index); if (error) goto err; queue->groups = rx_queue_default_groups; error = sysfs_create_groups(kobj, queue->groups); if (error) goto err; if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) goto err_default_groups; } error = rx_queue_default_mask(dev, queue); if (error) goto err_default_groups; kobject_uevent(kobj, KOBJ_ADD); return error; err_default_groups: sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; } static int rx_queue_change_owner(struct net_device *dev, int index, kuid_t kuid, kgid_t kgid) { struct netdev_rx_queue *queue = dev->_rx + index; struct kobject *kobj = &queue->kobj; int error; error = sysfs_change_owner(kobj, kuid, kgid); if (error) return error; if (dev->sysfs_rx_queue_group) error = sysfs_group_change_owner( kobj, dev->sysfs_rx_queue_group, kuid, kgid); return error; } #endif /* CONFIG_SYSFS */ int net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; #ifndef CONFIG_RPS if (!dev->sysfs_rx_queue_group) return 0; #endif for (i = old_num; i < new_num; i++) { error = rx_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; } } while (--i >= new_num) { struct netdev_rx_queue *queue = &dev->_rx[i]; struct kobject *kobj = &queue->kobj; if (!refcount_read(&dev_net(dev)->ns.count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); sysfs_remove_groups(kobj, queue->groups); kobject_put(kobj); } return error; #else return 0; #endif } static int net_rx_queue_change_owner(struct net_device *dev, int num, kuid_t kuid, kgid_t kgid) { #ifdef CONFIG_SYSFS int error = 0; int i; #ifndef CONFIG_RPS if (!dev->sysfs_rx_queue_group) return 0; #endif for (i = 0; i < num; i++) { error = rx_queue_change_owner(dev, i, kuid, kgid); if (error) break; } return error; #else return 0; #endif } #ifdef CONFIG_SYSFS /* * netdev_queue sysfs structures and functions. */ struct netdev_queue_attribute { struct attribute attr; ssize_t (*show)(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf); ssize_t (*store)(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len); }; #define to_netdev_queue_attr(_attr) \ container_of(_attr, struct netdev_queue_attribute, attr) #define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj) static ssize_t netdev_queue_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { const struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); struct netdev_queue *queue = to_netdev_queue(kobj); if (!attribute->show) return -EIO; return attribute->show(kobj, attr, queue, buf); } static ssize_t netdev_queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { const struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); struct netdev_queue *queue = to_netdev_queue(kobj); if (!attribute->store) return -EIO; return attribute->store(kobj, attr, queue, buf, count); } static const struct sysfs_ops netdev_queue_sysfs_ops = { .show = netdev_queue_attr_show, .store = netdev_queue_attr_store, }; static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); return sysfs_emit(buf, fmt_ulong, trans_timeout); } static unsigned int get_netdev_queue_index(struct netdev_queue *queue) { struct net_device *dev = queue->dev; unsigned int i; i = queue - dev->_tx; BUG_ON(i >= dev->num_tx_queues); return i; } static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; int num_tc, tc, index, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; ret = sysfs_rtnl_lock(kobj, attr, queue->dev); if (ret) return ret; index = get_netdev_queue_index(queue); /* If queue belongs to subordinate dev use its TC mapping */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; num_tc = dev->num_tc; tc = netdev_txq_to_tc(dev, index); rtnl_unlock(); if (tc < 0) return -EINVAL; /* We can report the traffic class one of two ways: * Subordinate device traffic classes are reported with the traffic * class first, and then the subordinate class so for example TC0 on * subordinate device 2 will be reported as "0-2". If the queue * belongs to the root device it will be reported with just the * traffic class, so just "0" for TC 0 for example. */ return num_tc < 0 ? sysfs_emit(buf, "%d%d\n", tc, num_tc) : sysfs_emit(buf, "%d\n", tc); } #ifdef CONFIG_XPS static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%lu\n", queue->tx_maxrate); } static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { int err, index = get_netdev_queue_index(queue); struct net_device *dev = queue->dev; u32 rate = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; /* The check is also done later; this helps returning early without * hitting the locking section below. */ if (!dev->netdev_ops->ndo_set_tx_maxrate) return -EOPNOTSUPP; err = kstrtou32(buf, 10, &rate); if (err < 0) return err; err = sysfs_rtnl_lock(kobj, attr, dev); if (err) return err; err = -EOPNOTSUPP; netdev_lock_ops(dev); if (dev->netdev_ops->ndo_set_tx_maxrate) err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); netdev_unlock_ops(dev); if (!err) { queue->tx_maxrate = rate; rtnl_unlock(); return len; } rtnl_unlock(); return err; } static struct netdev_queue_attribute queue_tx_maxrate __ro_after_init = __ATTR_RW(tx_maxrate); #endif static struct netdev_queue_attribute queue_trans_timeout __ro_after_init = __ATTR_RO(tx_timeout); static struct netdev_queue_attribute queue_traffic_class __ro_after_init = __ATTR_RO(traffic_class); #ifdef CONFIG_BQL /* * Byte queue limits sysfs structures and functions. */ static ssize_t bql_show(char *buf, unsigned int value) { return sysfs_emit(buf, "%u\n", value); } static ssize_t bql_set(const char *buf, const size_t count, unsigned int *pvalue) { unsigned int value; int err; if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) { value = DQL_MAX_LIMIT; } else { err = kstrtouint(buf, 10, &value); if (err < 0) return err; if (value > DQL_MAX_LIMIT) return -EINVAL; } *pvalue = value; return count; } static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); } static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { struct dql *dql = &queue->dql; unsigned int value; int err; err = kstrtouint(buf, 10, &value); if (err < 0) return err; dql->slack_hold_time = msecs_to_jiffies(value); return len; } static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init = __ATTR(hold_time, 0644, bql_show_hold_time, bql_set_hold_time); static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); } static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { struct dql *dql = &queue->dql; unsigned int value; int err; err = kstrtouint(buf, 10, &value); if (err < 0) return err; value = msecs_to_jiffies(value); if (value && (value < 4 || value > 4 / 2 * BITS_PER_LONG)) return -ERANGE; if (!dql->stall_thrs && value) dql->last_reap = jiffies; /* Force last_reap to be live */ smp_wmb(); dql->stall_thrs = value; return len; } static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init = __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs); static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); } static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { WRITE_ONCE(queue->dql.stall_max, 0); return len; } static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init = __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max); static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%lu\n", dql->stall_cnt); } static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL); static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", dql->num_queued - dql->num_completed); } static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = __ATTR(inflight, 0444, bql_show_inflight, NULL); #define BQL_ATTR(NAME, FIELD) \ static ssize_t bql_show_ ## NAME(struct kobject *kobj, \ struct attribute *attr, \ struct netdev_queue *queue, char *buf) \ { \ return bql_show(buf, queue->dql.FIELD); \ } \ \ static ssize_t bql_set_ ## NAME(struct kobject *kobj, \ struct attribute *attr, \ struct netdev_queue *queue, \ const char *buf, size_t len) \ { \ return bql_set(buf, len, &queue->dql.FIELD); \ } \ \ static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \ = __ATTR(NAME, 0644, \ bql_show_ ## NAME, bql_set_ ## NAME) BQL_ATTR(limit, limit); BQL_ATTR(limit_max, max_limit); BQL_ATTR(limit_min, min_limit); static struct attribute *dql_attrs[] __ro_after_init = { &bql_limit_attribute.attr, &bql_limit_max_attribute.attr, &bql_limit_min_attribute.attr, &bql_hold_time_attribute.attr, &bql_inflight_attribute.attr, &bql_stall_thrs_attribute.attr, &bql_stall_cnt_attribute.attr, &bql_stall_max_attribute.attr, NULL }; static const struct attribute_group dql_group = { .name = "byte_queue_limits", .attrs = dql_attrs, }; #else /* Fake declaration, all the code using it should be dead */ static const struct attribute_group dql_group = {}; #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS static ssize_t xps_queue_show(struct net_device *dev, unsigned int index, int tc, char *buf, enum xps_map_type type) { struct xps_dev_maps *dev_maps; unsigned long *mask; unsigned int nr_ids; int j, len; rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps[type]); /* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0 * when dev_maps hasn't been allocated yet, to be backward compatible. */ nr_ids = dev_maps ? dev_maps->nr_ids : (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues); mask = bitmap_zalloc(nr_ids, GFP_NOWAIT); if (!mask) { rcu_read_unlock(); return -ENOMEM; } if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; for (j = 0; j < nr_ids; j++) { int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; map = rcu_dereference(dev_maps->attr_map[tci]); if (!map) continue; for (i = map->len; i--;) { if (map->queues[i] == index) { __set_bit(j, mask); break; } } } out_no_maps: rcu_read_unlock(); len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; } static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; int len, tc, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; index = get_netdev_queue_index(queue); ret = sysfs_rtnl_lock(kobj, attr, queue->dev); if (ret) return ret; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); if (tc < 0) { rtnl_unlock(); return -EINVAL; } /* Increase the net device refcnt to make sure it won't be freed while * xps_queue_show is running. */ dev_hold(dev); rtnl_unlock(); len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); dev_put(dev); return len; } static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; unsigned int index; cpumask_var_t mask; int err; if (!netif_is_multiqueue(dev)) return -ENOENT; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; index = get_netdev_queue_index(queue); err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); if (err) { free_cpumask_var(mask); return err; } err = sysfs_rtnl_lock(kobj, attr, dev); if (err) { free_cpumask_var(mask); return err; } err = netif_set_xps_queue(dev, mask, index); rtnl_unlock(); free_cpumask_var(mask); return err ? : len; } static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; int tc, ret; index = get_netdev_queue_index(queue); ret = sysfs_rtnl_lock(kobj, attr, dev); if (ret) return ret; tc = netdev_txq_to_tc(dev, index); /* Increase the net device refcnt to make sure it won't be freed while * xps_queue_show is running. */ dev_hold(dev); rtnl_unlock(); ret = tc >= 0 ? xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL; dev_put(dev); return ret; } static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; struct net *net = dev_net(dev); unsigned long *mask; unsigned int index; int err; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); if (!mask) return -ENOMEM; index = get_netdev_queue_index(queue); err = bitmap_parse(buf, len, mask, dev->num_rx_queues); if (err) { bitmap_free(mask); return err; } err = sysfs_rtnl_lock(kobj, attr, dev); if (err) { bitmap_free(mask); return err; } cpus_read_lock(); err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS); cpus_read_unlock(); rtnl_unlock(); bitmap_free(mask); return err ? : len; } static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init = __ATTR_RW(xps_rxqs); #endif /* CONFIG_XPS */ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { &queue_trans_timeout.attr, &queue_traffic_class.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, &xps_rxqs_attribute.attr, &queue_tx_maxrate.attr, #endif NULL }; ATTRIBUTE_GROUPS(netdev_queue_default); static void netdev_queue_release(struct kobject *kobj) { struct netdev_queue *queue = to_netdev_queue(kobj); memset(kobj, 0, sizeof(*kobj)); netdev_put(queue->dev, &queue->dev_tracker); } static const void *netdev_queue_namespace(const struct kobject *kobj) { struct netdev_queue *queue = to_netdev_queue(kobj); struct device *dev = &queue->dev->dev; const void *ns = NULL; if (dev->class && dev->class->namespace) ns = dev->class->namespace(dev); return ns; } static void netdev_queue_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { const struct net *net = netdev_queue_namespace(kobj); net_ns_get_ownership(net, uid, gid); } static const struct kobj_type netdev_queue_ktype = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, .namespace = netdev_queue_namespace, .get_ownership = netdev_queue_get_ownership, }; static bool netdev_uses_bql(const struct net_device *dev) { if (dev->lltx || (dev->priv_flags & IFF_NO_QUEUE)) return false; return IS_ENABLED(CONFIG_BQL); } static int netdev_queue_add_kobject(struct net_device *dev, int index) { struct netdev_queue *queue = dev->_tx + index; struct kobject *kobj = &queue->kobj; int error = 0; /* Tx queues are cleared in netdev_queue_release to allow later * re-registration. This is triggered when their kobj refcount is * dropped. * * If a queue is removed while both a read (or write) operation and a * the re-addition of the same queue are pending (waiting on rntl_lock) * it might happen that the re-addition will execute before the read, * making the initial removal to never happen (queue's kobj refcount * won't drop enough because of the pending read). In such rare case, * return to allow the removal operation to complete. */ if (unlikely(kobj->state_initialized)) { netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed"); return -EAGAIN; } /* Kobject_put later will trigger netdev_queue_release call * which decreases dev refcount: Take that reference here */ netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL); kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, "tx-%u", index); if (error) goto err; queue->groups = netdev_queue_default_groups; error = sysfs_create_groups(kobj, queue->groups); if (error) goto err; if (netdev_uses_bql(dev)) { error = sysfs_create_group(kobj, &dql_group); if (error) goto err_default_groups; } kobject_uevent(kobj, KOBJ_ADD); return 0; err_default_groups: sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; } static int tx_queue_change_owner(struct net_device *ndev, int index, kuid_t kuid, kgid_t kgid) { struct netdev_queue *queue = ndev->_tx + index; struct kobject *kobj = &queue->kobj; int error; error = sysfs_change_owner(kobj, kuid, kgid); if (error) return error; if (netdev_uses_bql(ndev)) error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid); return error; } #endif /* CONFIG_SYSFS */ int netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; /* Tx queue kobjects are allowed to be updated when a device is being * unregistered, but solely to remove queues from qdiscs. Any path * adding queues should be fixed. */ WARN(dev->reg_state == NETREG_UNREGISTERING && new_num > old_num, "New queues can't be registered after device unregistration."); for (i = old_num; i < new_num; i++) { error = netdev_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; } } while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; if (!refcount_read(&dev_net(dev)->ns.count)) queue->kobj.uevent_suppress = 1; if (netdev_uses_bql(dev)) sysfs_remove_group(&queue->kobj, &dql_group); sysfs_remove_groups(&queue->kobj, queue->groups); kobject_put(&queue->kobj); } return error; #else return 0; #endif /* CONFIG_SYSFS */ } static int net_tx_queue_change_owner(struct net_device *dev, int num, kuid_t kuid, kgid_t kgid) { #ifdef CONFIG_SYSFS int error = 0; int i; for (i = 0; i < num; i++) { error = tx_queue_change_owner(dev, i, kuid, kgid); if (error) break; } return error; #else return 0; #endif /* CONFIG_SYSFS */ } static int register_queue_kobjects(struct net_device *dev) { int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS dev->queues_kset = kset_create_and_add("queues", NULL, &dev->dev.kobj); if (!dev->queues_kset) return -ENOMEM; real_rx = dev->real_num_rx_queues; #endif real_tx = dev->real_num_tx_queues; error = net_rx_queue_update_kobjects(dev, 0, real_rx); if (error) goto error; rxq = real_rx; error = netdev_queue_update_kobjects(dev, 0, real_tx); if (error) goto error; txq = real_tx; return 0; error: netdev_queue_update_kobjects(dev, txq, 0); net_rx_queue_update_kobjects(dev, rxq, 0); #ifdef CONFIG_SYSFS kset_unregister(dev->queues_kset); #endif return error; } static int queue_change_owner(struct net_device *ndev, kuid_t kuid, kgid_t kgid) { int error = 0, real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS if (ndev->queues_kset) { error = sysfs_change_owner(&ndev->queues_kset->kobj, kuid, kgid); if (error) return error; } real_rx = ndev->real_num_rx_queues; #endif real_tx = ndev->real_num_tx_queues; error = net_rx_queue_change_owner(ndev, real_rx, kuid, kgid); if (error) return error; error = net_tx_queue_change_owner(ndev, real_tx, kuid, kgid); if (error) return error; return 0; } static void remove_queue_kobjects(struct net_device *dev) { int real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS real_rx = dev->real_num_rx_queues; #endif real_tx = dev->real_num_tx_queues; net_rx_queue_update_kobjects(dev, real_rx, 0); netdev_queue_update_kobjects(dev, real_tx, 0); netdev_lock_ops(dev); dev->real_num_rx_queues = 0; dev->real_num_tx_queues = 0; netdev_unlock_ops(dev); #ifdef CONFIG_SYSFS kset_unregister(dev->queues_kset); #endif } static bool net_current_may_mount(void) { struct net *net = current->nsproxy->net_ns; return ns_capable(net->user_ns, CAP_SYS_ADMIN); } static void *net_grab_current_ns(void) { struct net *ns = current->nsproxy->net_ns; #ifdef CONFIG_NET_NS if (ns) refcount_inc(&ns->passive); #endif return ns; } static const void *net_initial_ns(void) { return &init_net; } static const void *net_netlink_ns(struct sock *sk) { return sock_net(sk); } const struct kobj_ns_type_operations net_ns_type_operations = { .type = KOBJ_NS_TYPE_NET, .current_may_mount = net_current_may_mount, .grab_current_ns = net_grab_current_ns, .netlink_ns = net_netlink_ns, .initial_ns = net_initial_ns, .drop_ns = net_drop_ns, }; EXPORT_SYMBOL_GPL(net_ns_type_operations); static int netdev_uevent(const struct device *d, struct kobj_uevent_env *env) { const struct net_device *dev = to_net_dev(d); int retval; /* pass interface to uevent. */ retval = add_uevent_var(env, "INTERFACE=%s", dev->name); if (retval) goto exit; /* pass ifindex to uevent. * ifindex is useful as it won't change (interface name may change) * and is what RtNetlink uses natively. */ retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex); exit: return retval; } /* * netdev_release -- destroy and free a dead device. * Called when last reference to device kobject is gone. */ static void netdev_release(struct device *d) { struct net_device *dev = to_net_dev(d); BUG_ON(dev->reg_state != NETREG_RELEASED); /* no need to wait for rcu grace period: * device is dead and about to be freed. */ kfree(rcu_access_pointer(dev->ifalias)); kvfree(dev); } static const void *net_namespace(const struct device *d) { const struct net_device *dev = to_net_dev(d); return dev_net(dev); } static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid) { const struct net_device *dev = to_net_dev(d); const struct net *net = dev_net(dev); net_ns_get_ownership(net, uid, gid); } static const struct class net_class = { .name = "net", .dev_release = netdev_release, .dev_groups = net_class_groups, .dev_uevent = netdev_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, .get_ownership = net_get_ownership, }; #ifdef CONFIG_OF static int of_dev_node_match(struct device *dev, const void *data) { for (; dev; dev = dev->parent) { if (dev->of_node == data) return 1; } return 0; } /* * of_find_net_device_by_node - lookup the net device for the device node * @np: OF device node * * Looks up the net_device structure corresponding with the device node. * If successful, returns a pointer to the net_device with the embedded * struct device refcount incremented by one, or NULL on failure. The * refcount must be dropped when done with the net_device. */ struct net_device *of_find_net_device_by_node(struct device_node *np) { struct device *dev; dev = class_find_device(&net_class, NULL, np, of_dev_node_match); if (!dev) return NULL; return to_net_dev(dev); } EXPORT_SYMBOL(of_find_net_device_by_node); #endif /* Delete sysfs entries but hold kobject reference until after all * netdev references are gone. */ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; if (!refcount_read(&dev_net(ndev)->ns.count)) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); remove_queue_kobjects(ndev); pm_runtime_set_memalloc_noio(dev, false); device_del(dev); } /* Create sysfs entries for network device. */ int netdev_register_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; const struct attribute_group **groups = ndev->sysfs_groups; int error = 0; device_initialize(dev); dev->class = &net_class; dev->platform_data = ndev; dev->groups = groups; dev_set_name(dev, "%s", ndev->name); #ifdef CONFIG_SYSFS /* Allow for a device specific group */ if (*groups) groups++; *groups++ = &netstat_group; if (wireless_group_needed(ndev)) *groups++ = &wireless_group; #endif /* CONFIG_SYSFS */ error = device_add(dev); if (error) return error; error = register_queue_kobjects(ndev); if (error) { device_del(dev); return error; } pm_runtime_set_memalloc_noio(dev, true); return error; } /* Change owner for sysfs entries when moving network devices across network * namespaces owned by different user namespaces. */ int netdev_change_owner(struct net_device *ndev, const struct net *net_old, const struct net *net_new) { kuid_t old_uid = GLOBAL_ROOT_UID, new_uid = GLOBAL_ROOT_UID; kgid_t old_gid = GLOBAL_ROOT_GID, new_gid = GLOBAL_ROOT_GID; struct device *dev = &ndev->dev; int error; net_ns_get_ownership(net_old, &old_uid, &old_gid); net_ns_get_ownership(net_new, &new_uid, &new_gid); /* The network namespace was changed but the owning user namespace is * identical so there's no need to change the owner of sysfs entries. */ if (uid_eq(old_uid, new_uid) && gid_eq(old_gid, new_gid)) return 0; error = device_change_owner(dev, new_uid, new_gid); if (error) return error; error = queue_change_owner(ndev, new_uid, new_gid); if (error) return error; return 0; } int netdev_class_create_file_ns(const struct class_attribute *class_attr, const void *ns) { return class_create_file_ns(&net_class, class_attr, ns); } EXPORT_SYMBOL(netdev_class_create_file_ns); void netdev_class_remove_file_ns(const struct class_attribute *class_attr, const void *ns) { class_remove_file_ns(&net_class, class_attr, ns); } EXPORT_SYMBOL(netdev_class_remove_file_ns); int __init netdev_kobject_init(void) { kobj_ns_type_register(&net_ns_type_operations); return class_register(&net_class); }
1 1 1 1 1 3 3 1 2 3 3 18 4 1 3 17 12 5 5 4 1 3 1 3 3 1 1 1 6 1 5 5 5 2 3 3 5 5 5 5 15 2 1 13 2 1 1 1 1 6 3 5 3 1 2 1 1 1 1 1 1 7 1 6 8 1 1 1 1 1 1 2 1 3 6 3 3 3 3 2190 2142 131 72 9 5 5 17 17 17 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* isotp.c - ISO 15765-2 CAN transport protocol for protocol family CAN * * This implementation does not provide ISO-TP specific return values to the * userspace. * * - RX path timeout of data reception leads to -ETIMEDOUT * - RX path SN mismatch leads to -EILSEQ * - RX path data reception with wrong padding leads to -EBADMSG * - TX path flowcontrol reception timeout leads to -ECOMM * - TX path flowcontrol reception overflow leads to -EMSGSIZE * - TX path flowcontrol reception with wrong layout/padding leads to -EBADMSG * - when a transfer (tx) is on the run the next write() blocks until it's done * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent * - as we have static buffers the check whether the PDU fits into the buffer * is done at FF reception time (no support for sending 'wait frames') * * Copyright (c) 2020 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/wait.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/isotp.h> #include <linux/slab.h> #include <net/sock.h> #include <net/net_namespace.h> MODULE_DESCRIPTION("PF_CAN ISO 15765-2 transport protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>"); MODULE_ALIAS("can-proto-6"); #define ISOTP_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp) #define SINGLE_MASK(id) (((id) & CAN_EFF_FLAG) ? \ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) /* Since ISO 15765-2:2016 the CAN isotp protocol supports more than 4095 * byte per ISO PDU as the FF_DL can take full 32 bit values (4 Gbyte). * We would need some good concept to handle this between user space and * kernel space. For now set the static buffer to something about 8 kbyte * to be able to test this new functionality. */ #define DEFAULT_MAX_PDU_SIZE 8300 /* maximum PDU size before ISO 15765-2:2016 extension was 4095 */ #define MAX_12BIT_PDU_SIZE 4095 /* limit the isotp pdu size from the optional module parameter to 1MByte */ #define MAX_PDU_SIZE (1025 * 1024U) static unsigned int max_pdu_size __read_mostly = DEFAULT_MAX_PDU_SIZE; module_param(max_pdu_size, uint, 0444); MODULE_PARM_DESC(max_pdu_size, "maximum isotp pdu size (default " __stringify(DEFAULT_MAX_PDU_SIZE) ")"); /* N_PCI type values in bits 7-4 of N_PCI bytes */ #define N_PCI_SF 0x00 /* single frame */ #define N_PCI_FF 0x10 /* first frame */ #define N_PCI_CF 0x20 /* consecutive frame */ #define N_PCI_FC 0x30 /* flow control */ #define N_PCI_SZ 1 /* size of the PCI byte #1 */ #define SF_PCI_SZ4 1 /* size of SingleFrame PCI including 4 bit SF_DL */ #define SF_PCI_SZ8 2 /* size of SingleFrame PCI including 8 bit SF_DL */ #define FF_PCI_SZ12 2 /* size of FirstFrame PCI including 12 bit FF_DL */ #define FF_PCI_SZ32 6 /* size of FirstFrame PCI including 32 bit FF_DL */ #define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */ #define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA) #define ISOTP_ALL_BC_FLAGS (CAN_ISOTP_SF_BROADCAST | CAN_ISOTP_CF_BROADCAST) /* Flow Status given in FC frame */ #define ISOTP_FC_CTS 0 /* clear to send */ #define ISOTP_FC_WT 1 /* wait */ #define ISOTP_FC_OVFLW 2 /* overflow */ #define ISOTP_FC_TIMEOUT 1 /* 1 sec */ #define ISOTP_ECHO_TIMEOUT 2 /* 2 secs */ enum { ISOTP_IDLE = 0, ISOTP_WAIT_FIRST_FC, ISOTP_WAIT_FC, ISOTP_WAIT_DATA, ISOTP_SENDING, ISOTP_SHUTDOWN, }; struct tpcon { u8 *buf; unsigned int buflen; unsigned int len; unsigned int idx; u32 state; u8 bs; u8 sn; u8 ll_dl; u8 sbuf[DEFAULT_MAX_PDU_SIZE]; }; struct isotp_sock { struct sock sk; int bound; int ifindex; canid_t txid; canid_t rxid; ktime_t tx_gap; ktime_t lastrxcf_tstamp; struct hrtimer rxtimer, txtimer, txfrtimer; struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; struct can_isotp_ll_options ll; u32 frame_txtime; u32 force_tx_stmin; u32 force_rx_stmin; u32 cfecho; /* consecutive frame echo tag */ struct tpcon rx, tx; struct list_head notifier; wait_queue_head_t wait; spinlock_t rx_lock; /* protect single thread state machine */ }; static LIST_HEAD(isotp_notifier_list); static DEFINE_SPINLOCK(isotp_notifier_lock); static struct isotp_sock *isotp_busy_notifier; static inline struct isotp_sock *isotp_sk(const struct sock *sk) { return (struct isotp_sock *)sk; } static u32 isotp_bc_flags(struct isotp_sock *so) { return so->opt.flags & ISOTP_ALL_BC_FLAGS; } static bool isotp_register_rxid(struct isotp_sock *so) { /* no broadcast modes => register rx_id for FC frame reception */ return (isotp_bc_flags(so) == 0); } static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, rxtimer); struct sock *sk = &so->sk; if (so->rx.state == ISOTP_WAIT_DATA) { /* we did not get new data frames in time */ /* report 'connection timed out' */ sk->sk_err = ETIMEDOUT; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; } return HRTIMER_NORESTART; } static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) { struct net_device *dev; struct sk_buff *nskb; struct canfd_frame *ncf; struct isotp_sock *so = isotp_sk(sk); int can_send_ret; nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any()); if (!nskb) return 1; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { kfree_skb(nskb); return 1; } can_skb_reserve(nskb); can_skb_prv(nskb)->ifindex = dev->ifindex; can_skb_prv(nskb)->skbcnt = 0; nskb->dev = dev; can_skb_set_owner(nskb, sk); ncf = (struct canfd_frame *)nskb->data; skb_put_zero(nskb, so->ll.mtu); /* create & send flow control reply */ ncf->can_id = so->txid; if (so->opt.flags & CAN_ISOTP_TX_PADDING) { memset(ncf->data, so->opt.txpad_content, CAN_MAX_DLEN); ncf->len = CAN_MAX_DLEN; } else { ncf->len = ae + FC_CONTENT_SZ; } ncf->data[ae] = N_PCI_FC | flowstatus; ncf->data[ae + 1] = so->rxfc.bs; ncf->data[ae + 2] = so->rxfc.stmin; if (ae) ncf->data[0] = so->opt.ext_address; ncf->flags = so->ll.tx_flags; can_send_ret = can_send(nskb, 1); if (can_send_ret) pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(can_send_ret)); dev_put(dev); /* reset blocksize counter */ so->rx.bs = 0; /* reset last CF frame rx timestamp for rx stmin enforcement */ so->lastrxcf_tstamp = ktime_set(0, 0); /* start rx timeout watchdog */ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return 0; } static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk) { struct sockaddr_can *addr = (struct sockaddr_can *)skb->cb; BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = skb->dev->ifindex; if (sock_queue_rcv_skb(sk, skb) < 0) kfree_skb(skb); } static u8 padlen(u8 datalen) { static const u8 plen[] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, /* 0 - 8 */ 12, 12, 12, 12, /* 9 - 12 */ 16, 16, 16, 16, /* 13 - 16 */ 20, 20, 20, 20, /* 17 - 20 */ 24, 24, 24, 24, /* 21 - 24 */ 32, 32, 32, 32, 32, 32, 32, 32, /* 25 - 32 */ 48, 48, 48, 48, 48, 48, 48, 48, /* 33 - 40 */ 48, 48, 48, 48, 48, 48, 48, 48 /* 41 - 48 */ }; if (datalen > 48) return 64; return plen[datalen]; } /* check for length optimization and return 1/true when the check fails */ static int check_optimized(struct canfd_frame *cf, int start_index) { /* for CAN_DL <= 8 the start_index is equal to the CAN_DL as the * padding would start at this point. E.g. if the padding would * start at cf.data[7] cf->len has to be 7 to be optimal. * Note: The data[] index starts with zero. */ if (cf->len <= CAN_MAX_DLEN) return (cf->len != start_index); /* This relation is also valid in the non-linear DLC range, where * we need to take care of the minimal next possible CAN_DL. * The correct check would be (padlen(cf->len) != padlen(start_index)). * But as cf->len can only take discrete values from 12, .., 64 at this * point the padlen(cf->len) is always equal to cf->len. */ return (cf->len != padlen(start_index)); } /* check padding and return 1/true when the check fails */ static int check_pad(struct isotp_sock *so, struct canfd_frame *cf, int start_index, u8 content) { int i; /* no RX_PADDING value => check length of optimized frame length */ if (!(so->opt.flags & CAN_ISOTP_RX_PADDING)) { if (so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) return check_optimized(cf, start_index); /* no valid test against empty value => ignore frame */ return 1; } /* check datalength of correctly padded CAN frame */ if ((so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) && cf->len != padlen(cf->len)) return 1; /* check padding content */ if (so->opt.flags & CAN_ISOTP_CHK_PAD_DATA) { for (i = start_index; i < cf->len; i++) if (cf->data[i] != content) return 1; } return 0; } static void isotp_send_cframe(struct isotp_sock *so); static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) { struct sock *sk = &so->sk; if (so->tx.state != ISOTP_WAIT_FC && so->tx.state != ISOTP_WAIT_FIRST_FC) return 0; hrtimer_cancel(&so->txtimer); if ((cf->len < ae + FC_CONTENT_SZ) || ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, ae + FC_CONTENT_SZ, so->opt.rxpad_content))) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return 1; } /* get static/dynamic communication params from first/every FC frame */ if (so->tx.state == ISOTP_WAIT_FIRST_FC || so->opt.flags & CAN_ISOTP_DYN_FC_PARMS) { so->txfc.bs = cf->data[ae + 1]; so->txfc.stmin = cf->data[ae + 2]; /* fix wrong STmin values according spec */ if (so->txfc.stmin > 0x7F && (so->txfc.stmin < 0xF1 || so->txfc.stmin > 0xF9)) so->txfc.stmin = 0x7F; so->tx_gap = ktime_set(0, 0); /* add transmission time for CAN frame N_As */ so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime); /* add waiting time for consecutive frames N_Cs */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_add_ns(so->tx_gap, so->force_tx_stmin); else if (so->txfc.stmin < 0x80) so->tx_gap = ktime_add_ns(so->tx_gap, so->txfc.stmin * 1000000); else so->tx_gap = ktime_add_ns(so->tx_gap, (so->txfc.stmin - 0xF0) * 100000); so->tx.state = ISOTP_WAIT_FC; } switch (cf->data[ae] & 0x0F) { case ISOTP_FC_CTS: so->tx.bs = 0; so->tx.state = ISOTP_SENDING; /* send CF frame and enable echo timeout handling */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); isotp_send_cframe(so); break; case ISOTP_FC_WT: /* start timer to wait for next FC frame */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); break; case ISOTP_FC_OVFLW: /* overflow on receiver side - report 'message too long' */ sk->sk_err = EMSGSIZE; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); fallthrough; default: /* stop this tx job */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); } return 0; } static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen, struct sk_buff *skb, int len) { struct isotp_sock *so = isotp_sk(sk); struct sk_buff *nskb; hrtimer_cancel(&so->rxtimer); so->rx.state = ISOTP_IDLE; if (!len || len > cf->len - pcilen) return 1; if ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, pcilen + len, so->opt.rxpad_content)) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 1; } nskb = alloc_skb(len, gfp_any()); if (!nskb) return 1; memcpy(skb_put(nskb, len), &cf->data[pcilen], len); nskb->tstamp = skb->tstamp; nskb->dev = skb->dev; isotp_rcv_skb(nskb, sk); return 0; } static int isotp_rcv_ff(struct sock *sk, struct canfd_frame *cf, int ae) { struct isotp_sock *so = isotp_sk(sk); int i; int off; int ff_pci_sz; hrtimer_cancel(&so->rxtimer); so->rx.state = ISOTP_IDLE; /* get the used sender LL_DL from the (first) CAN frame data length */ so->rx.ll_dl = padlen(cf->len); /* the first frame has to use the entire frame up to LL_DL length */ if (cf->len != so->rx.ll_dl) return 1; /* get the FF_DL */ so->rx.len = (cf->data[ae] & 0x0F) << 8; so->rx.len += cf->data[ae + 1]; /* Check for FF_DL escape sequence supporting 32 bit PDU length */ if (so->rx.len) { ff_pci_sz = FF_PCI_SZ12; } else { /* FF_DL = 0 => get real length from next 4 bytes */ so->rx.len = cf->data[ae + 2] << 24; so->rx.len += cf->data[ae + 3] << 16; so->rx.len += cf->data[ae + 4] << 8; so->rx.len += cf->data[ae + 5]; ff_pci_sz = FF_PCI_SZ32; } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->rx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; if (so->rx.len + ae + off + ff_pci_sz < so->rx.ll_dl) return 1; /* PDU size > default => try max_pdu_size */ if (so->rx.len > so->rx.buflen && so->rx.buflen < max_pdu_size) { u8 *newbuf = kmalloc(max_pdu_size, GFP_ATOMIC); if (newbuf) { so->rx.buf = newbuf; so->rx.buflen = max_pdu_size; } } if (so->rx.len > so->rx.buflen) { /* send FC frame with overflow status */ isotp_send_fc(sk, ae, ISOTP_FC_OVFLW); return 1; } /* copy the first received data bytes */ so->rx.idx = 0; for (i = ae + ff_pci_sz; i < so->rx.ll_dl; i++) so->rx.buf[so->rx.idx++] = cf->data[i]; /* initial setup for this pdu reception */ so->rx.sn = 1; so->rx.state = ISOTP_WAIT_DATA; /* no creation of flow control frames */ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) return 0; /* send our first FC frame */ isotp_send_fc(sk, ae, ISOTP_FC_CTS); return 0; } static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, struct sk_buff *skb) { struct isotp_sock *so = isotp_sk(sk); struct sk_buff *nskb; int i; if (so->rx.state != ISOTP_WAIT_DATA) return 0; /* drop if timestamp gap is less than force_rx_stmin nano secs */ if (so->opt.flags & CAN_ISOTP_FORCE_RXSTMIN) { if (ktime_to_ns(ktime_sub(skb->tstamp, so->lastrxcf_tstamp)) < so->force_rx_stmin) return 0; so->lastrxcf_tstamp = skb->tstamp; } hrtimer_cancel(&so->rxtimer); /* CFs are never longer than the FF */ if (cf->len > so->rx.ll_dl) return 1; /* CFs have usually the LL_DL length */ if (cf->len < so->rx.ll_dl) { /* this is only allowed for the last CF */ if (so->rx.len - so->rx.idx > so->rx.ll_dl - ae - N_PCI_SZ) return 1; } if ((cf->data[ae] & 0x0F) != so->rx.sn) { /* wrong sn detected - report 'illegal byte sequence' */ sk->sk_err = EILSEQ; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; return 1; } so->rx.sn++; so->rx.sn %= 16; for (i = ae + N_PCI_SZ; i < cf->len; i++) { so->rx.buf[so->rx.idx++] = cf->data[i]; if (so->rx.idx >= so->rx.len) break; } if (so->rx.idx >= so->rx.len) { /* we are done */ so->rx.state = ISOTP_IDLE; if ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, i + 1, so->opt.rxpad_content)) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 1; } nskb = alloc_skb(so->rx.len, gfp_any()); if (!nskb) return 1; memcpy(skb_put(nskb, so->rx.len), so->rx.buf, so->rx.len); nskb->tstamp = skb->tstamp; nskb->dev = skb->dev; isotp_rcv_skb(nskb, sk); return 0; } /* perform blocksize handling, if enabled */ if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) { /* start rx timeout watchdog */ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return 0; } /* no creation of flow control frames */ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) return 0; /* we reached the specified blocksize so->rxfc.bs */ isotp_send_fc(sk, ae, ISOTP_FC_CTS); return 0; } static void isotp_rcv(struct sk_buff *skb, void *data) { struct sock *sk = (struct sock *)data; struct isotp_sock *so = isotp_sk(sk); struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; u8 n_pci_type, sf_dl; /* Strictly receive only frames with the configured MTU size * => clear separation of CAN2.0 / CAN FD transport channels */ if (skb->len != so->ll.mtu) return; cf = (struct canfd_frame *)skb->data; /* if enabled: check reception of my configured extended address */ if (ae && cf->data[0] != so->opt.rx_ext_address) return; n_pci_type = cf->data[ae] & 0xF0; /* Make sure the state changes and data structures stay consistent at * CAN frame reception time. This locking is not needed in real world * use cases but the inconsistency can be triggered with syzkaller. */ spin_lock(&so->rx_lock); if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) { /* check rx/tx path half duplex expectations */ if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) || (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC)) goto out_unlock; } switch (n_pci_type) { case N_PCI_FC: /* tx path: flow control frame containing the FC parameters */ isotp_rcv_fc(so, cf, ae); break; case N_PCI_SF: /* rx path: single frame * * As we do not have a rx.ll_dl configuration, we can only test * if the CAN frames payload length matches the LL_DL == 8 * requirements - no matter if it's CAN 2.0 or CAN FD */ /* get the SF_DL from the N_PCI byte */ sf_dl = cf->data[ae] & 0x0F; if (cf->len <= CAN_MAX_DLEN) { isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl); } else { if (can_is_canfd_skb(skb)) { /* We have a CAN FD frame and CAN_DL is greater than 8: * Only frames with the SF_DL == 0 ESC value are valid. * * If so take care of the increased SF PCI size * (SF_PCI_SZ8) to point to the message content behind * the extended SF PCI info and get the real SF_DL * length value from the formerly first data byte. */ if (sf_dl == 0) isotp_rcv_sf(sk, cf, SF_PCI_SZ8 + ae, skb, cf->data[SF_PCI_SZ4 + ae]); } } break; case N_PCI_FF: /* rx path: first frame */ isotp_rcv_ff(sk, cf, ae); break; case N_PCI_CF: /* rx path: consecutive frame */ isotp_rcv_cf(sk, cf, ae, skb); break; } out_unlock: spin_unlock(&so->rx_lock); } static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so, int ae, int off) { int pcilen = N_PCI_SZ + ae + off; int space = so->tx.ll_dl - pcilen; int num = min_t(int, so->tx.len - so->tx.idx, space); int i; cf->can_id = so->txid; cf->len = num + pcilen; if (num < space) { if (so->opt.flags & CAN_ISOTP_TX_PADDING) { /* user requested padding */ cf->len = padlen(cf->len); memset(cf->data, so->opt.txpad_content, cf->len); } else if (cf->len > CAN_MAX_DLEN) { /* mandatory padding for CAN FD frames */ cf->len = padlen(cf->len); memset(cf->data, CAN_ISOTP_DEFAULT_PAD_CONTENT, cf->len); } } for (i = 0; i < num; i++) cf->data[pcilen + i] = so->tx.buf[so->tx.idx++]; if (ae) cf->data[0] = so->opt.ext_address; } static void isotp_send_cframe(struct isotp_sock *so) { struct sock *sk = &so->sk; struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; int can_send_ret; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) return; skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC); if (!skb) { dev_put(dev); return; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); /* create consecutive frame */ isotp_fill_dataframe(cf, so, ae, 0); /* place consecutive frame N_PCI in appropriate index */ cf->data[ae] = N_PCI_CF | so->tx.sn++; so->tx.sn %= 16; so->tx.bs++; cf->flags = so->ll.tx_flags; skb->dev = dev; can_skb_set_owner(skb, sk); /* cfecho should have been zero'ed by init/isotp_rcv_echo() */ if (so->cfecho) pr_notice_once("can-isotp: cfecho is %08X != 0\n", so->cfecho); /* set consecutive frame echo tag */ so->cfecho = *(u32 *)cf->data; /* send frame with local echo enabled */ can_send_ret = can_send(skb, 1); if (can_send_ret) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(can_send_ret)); if (can_send_ret == -ENOBUFS) pr_notice_once("can-isotp: tx queue is full\n"); } dev_put(dev); } static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, int ae) { int i; int ff_pci_sz; cf->can_id = so->txid; cf->len = so->tx.ll_dl; if (ae) cf->data[0] = so->opt.ext_address; /* create N_PCI bytes with 12/32 bit FF_DL data length */ if (so->tx.len > MAX_12BIT_PDU_SIZE) { /* use 32 bit FF_DL notation */ cf->data[ae] = N_PCI_FF; cf->data[ae + 1] = 0; cf->data[ae + 2] = (u8)(so->tx.len >> 24) & 0xFFU; cf->data[ae + 3] = (u8)(so->tx.len >> 16) & 0xFFU; cf->data[ae + 4] = (u8)(so->tx.len >> 8) & 0xFFU; cf->data[ae + 5] = (u8)so->tx.len & 0xFFU; ff_pci_sz = FF_PCI_SZ32; } else { /* use 12 bit FF_DL notation */ cf->data[ae] = (u8)(so->tx.len >> 8) | N_PCI_FF; cf->data[ae + 1] = (u8)so->tx.len & 0xFFU; ff_pci_sz = FF_PCI_SZ12; } /* add first data bytes depending on ae */ for (i = ae + ff_pci_sz; i < so->tx.ll_dl; i++) cf->data[i] = so->tx.buf[so->tx.idx++]; so->tx.sn = 1; } static void isotp_rcv_echo(struct sk_buff *skb, void *data) { struct sock *sk = (struct sock *)data; struct isotp_sock *so = isotp_sk(sk); struct canfd_frame *cf = (struct canfd_frame *)skb->data; /* only handle my own local echo CF/SF skb's (no FF!) */ if (skb->sk != sk || so->cfecho != *(u32 *)cf->data) return; /* cancel local echo timeout */ hrtimer_cancel(&so->txtimer); /* local echo skb with consecutive frame has been consumed */ so->cfecho = 0; if (so->tx.idx >= so->tx.len) { /* we are done */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return; } if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { /* stop and wait for FC with timeout */ so->tx.state = ISOTP_WAIT_FC; hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return; } /* no gap between data frames needed => use burst mode */ if (!so->tx_gap) { /* enable echo timeout handling */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); isotp_send_cframe(so); return; } /* start timer to send next consecutive frame with correct delay */ hrtimer_start(&so->txfrtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); } static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txtimer); struct sock *sk = &so->sk; /* don't handle timeouts in IDLE or SHUTDOWN state */ if (so->tx.state == ISOTP_IDLE || so->tx.state == ISOTP_SHUTDOWN) return HRTIMER_NORESTART; /* we did not get any flow control or echo frame in time */ /* report 'communication error on send' */ sk->sk_err = ECOMM; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset tx state */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return HRTIMER_NORESTART; } static enum hrtimer_restart isotp_txfr_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txfrtimer); /* start echo timeout handling and cover below protocol error */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); /* cfecho should be consumed by isotp_rcv_echo() here */ if (so->tx.state == ISOTP_SENDING && !so->cfecho) isotp_send_cframe(so); return HRTIMER_NORESTART; } static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0; s64 hrtimer_sec = ISOTP_ECHO_TIMEOUT; int off; int err; if (!so->bound || so->tx.state == ISOTP_SHUTDOWN) return -EADDRNOTAVAIL; while (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE) { /* we do not support multiple buffers - for now */ if (msg->msg_flags & MSG_DONTWAIT) return -EAGAIN; if (so->tx.state == ISOTP_SHUTDOWN) return -EADDRNOTAVAIL; /* wait for complete transmission of current pdu */ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); if (err) goto err_event_drop; } /* PDU size > default => try max_pdu_size */ if (size > so->tx.buflen && so->tx.buflen < max_pdu_size) { u8 *newbuf = kmalloc(max_pdu_size, GFP_KERNEL); if (newbuf) { so->tx.buf = newbuf; so->tx.buflen = max_pdu_size; } } if (!size || size > so->tx.buflen) { err = -EINVAL; goto err_out_drop; } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; /* does the given data fit into a single frame for SF_BROADCAST? */ if ((isotp_bc_flags(so) == CAN_ISOTP_SF_BROADCAST) && (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) { err = -EINVAL; goto err_out_drop; } err = memcpy_from_msg(so->tx.buf, msg, size); if (err < 0) goto err_out_drop; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { err = -ENXIO; goto err_out_drop; } skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) { dev_put(dev); goto err_out_drop; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; so->tx.len = size; so->tx.idx = 0; cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); /* cfecho should have been zero'ed by init / former isotp_rcv_echo() */ if (so->cfecho) pr_notice_once("can-isotp: uninit cfecho %08X\n", so->cfecho); /* check for single frame transmission depending on TX_DL */ if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) { /* The message size generally fits into a SingleFrame - good. * * SF_DL ESC offset optimization: * * When TX_DL is greater 8 but the message would still fit * into a 8 byte CAN frame, we can omit the offset. * This prevents a protocol caused length extension from * CAN_DL = 8 to CAN_DL = 12 due to the SF_SL ESC handling. */ if (size <= CAN_MAX_DLEN - SF_PCI_SZ4 - ae) off = 0; isotp_fill_dataframe(cf, so, ae, off); /* place single frame N_PCI w/o length in appropriate index */ cf->data[ae] = N_PCI_SF; /* place SF_DL size value depending on the SF_DL ESC offset */ if (off) cf->data[SF_PCI_SZ4 + ae] = size; else cf->data[ae] |= size; /* set CF echo tag for isotp_rcv_echo() (SF-mode) */ so->cfecho = *(u32 *)cf->data; } else { /* send first frame */ isotp_create_fframe(cf, so, ae); if (isotp_bc_flags(so) == CAN_ISOTP_CF_BROADCAST) { /* set timer for FC-less operation (STmin = 0) */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_set(0, so->force_tx_stmin); else so->tx_gap = ktime_set(0, so->frame_txtime); /* disable wait for FCs due to activated block size */ so->txfc.bs = 0; /* set CF echo tag for isotp_rcv_echo() (CF-mode) */ so->cfecho = *(u32 *)cf->data; } else { /* standard flow control check */ so->tx.state = ISOTP_WAIT_FIRST_FC; /* start timeout for FC */ hrtimer_sec = ISOTP_FC_TIMEOUT; /* no CF echo tag for isotp_rcv_echo() (FF-mode) */ so->cfecho = 0; } } hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), HRTIMER_MODE_REL_SOFT); /* send the first or only CAN frame */ cf->flags = so->ll.tx_flags; skb->dev = dev; skb->sk = sk; err = can_send(skb, 1); dev_put(dev); if (err) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(err)); /* no transmission -> no timeout monitoring */ hrtimer_cancel(&so->txtimer); /* reset consecutive frame echo tag */ so->cfecho = 0; goto err_out_drop; } if (wait_tx_done) { /* wait for complete transmission of current pdu */ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); if (err) goto err_event_drop; err = sock_error(sk); if (err) return err; } return size; err_event_drop: /* got signal: force tx state machine to be idle */ so->tx.state = ISOTP_IDLE; hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); err_out_drop: /* drop this PDU and unlock a potential wait queue */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return err; } static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; struct isotp_sock *so = isotp_sk(sk); int ret = 0; if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK | MSG_CMSG_COMPAT)) return -EINVAL; if (!so->bound) return -EADDRNOTAVAIL; skb = skb_recv_datagram(sk, flags, &ret); if (!skb) return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; ret = memcpy_to_msg(msg, skb->data, size); if (ret < 0) goto out_err; sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(ISOTP_MIN_NAMELEN); msg->msg_namelen = ISOTP_MIN_NAMELEN; memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } /* set length of return value */ ret = (flags & MSG_TRUNC) ? skb->len : size; out_err: skb_free_datagram(sk, skb); return ret; } static int isotp_release(struct socket *sock) { struct sock *sk = sock->sk; struct isotp_sock *so; struct net *net; if (!sk) return 0; so = isotp_sk(sk); net = sock_net(sk); /* wait for complete transmission of current pdu */ while (wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE) == 0 && cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SHUTDOWN) != ISOTP_IDLE) ; /* force state machines to be idle also when a signal occurred */ so->tx.state = ISOTP_SHUTDOWN; so->rx.state = ISOTP_IDLE; spin_lock(&isotp_notifier_lock); while (isotp_busy_notifier == so) { spin_unlock(&isotp_notifier_lock); schedule_timeout_uninterruptible(1); spin_lock(&isotp_notifier_lock); } list_del(&so->notifier); spin_unlock(&isotp_notifier_lock); lock_sock(sk); /* remove current filters & unregister */ if (so->bound) { if (so->ifindex) { struct net_device *dev; dev = dev_get_by_index(net, so->ifindex); if (dev) { if (isotp_register_rxid(so)) can_rx_unregister(net, dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); can_rx_unregister(net, dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); dev_put(dev); synchronize_rcu(); } } } hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); hrtimer_cancel(&so->rxtimer); so->ifindex = 0; so->bound = 0; if (so->rx.buf != so->rx.sbuf) kfree(so->rx.buf); if (so->tx.buf != so->tx.sbuf) kfree(so->tx.buf); sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_put(sk); return 0; } static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); struct net *net = sock_net(sk); int ifindex; struct net_device *dev; canid_t tx_id = addr->can_addr.tp.tx_id; canid_t rx_id = addr->can_addr.tp.rx_id; int err = 0; int notify_enetdown = 0; if (len < ISOTP_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; /* sanitize tx CAN identifier */ if (tx_id & CAN_EFF_FLAG) tx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); else tx_id &= CAN_SFF_MASK; /* give feedback on wrong CAN-ID value */ if (tx_id != addr->can_addr.tp.tx_id) return -EINVAL; /* sanitize rx CAN identifier (if needed) */ if (isotp_register_rxid(so)) { if (rx_id & CAN_EFF_FLAG) rx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); else rx_id &= CAN_SFF_MASK; /* give feedback on wrong CAN-ID value */ if (rx_id != addr->can_addr.tp.rx_id) return -EINVAL; } if (!addr->can_ifindex) return -ENODEV; lock_sock(sk); if (so->bound) { err = -EINVAL; goto out; } /* ensure different CAN IDs when the rx_id is to be registered */ if (isotp_register_rxid(so) && rx_id == tx_id) { err = -EADDRNOTAVAIL; goto out; } dev = dev_get_by_index(net, addr->can_ifindex); if (!dev) { err = -ENODEV; goto out; } if (dev->type != ARPHRD_CAN) { dev_put(dev); err = -ENODEV; goto out; } if (dev->mtu < so->ll.mtu) { dev_put(dev); err = -EINVAL; goto out; } if (!(dev->flags & IFF_UP)) notify_enetdown = 1; ifindex = dev->ifindex; if (isotp_register_rxid(so)) can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id), isotp_rcv, sk, "isotp", sk); /* no consecutive frame echo skb in flight */ so->cfecho = 0; /* register for echo skb's */ can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id), isotp_rcv_echo, sk, "isotpe", sk); dev_put(dev); /* switch to new settings */ so->ifindex = ifindex; so->rxid = rx_id; so->txid = tx_id; so->bound = 1; out: release_sock(sk); if (notify_enetdown) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } return err; } static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); if (peer) return -EOPNOTSUPP; memset(addr, 0, ISOTP_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = so->ifindex; addr->can_addr.tp.rx_id = so->rxid; addr->can_addr.tp.tx_id = so->txid; return ISOTP_MIN_NAMELEN; } static int isotp_setsockopt_locked(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); int ret = 0; if (so->bound) return -EISCONN; switch (optname) { case CAN_ISOTP_OPTS: if (optlen != sizeof(struct can_isotp_options)) return -EINVAL; if (copy_from_sockptr(&so->opt, optval, optlen)) return -EFAULT; /* no separate rx_ext_address is given => use ext_address */ if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) so->opt.rx_ext_address = so->opt.ext_address; /* these broadcast flags are not allowed together */ if (isotp_bc_flags(so) == ISOTP_ALL_BC_FLAGS) { /* CAN_ISOTP_SF_BROADCAST is prioritized */ so->opt.flags &= ~CAN_ISOTP_CF_BROADCAST; /* give user feedback on wrong config attempt */ ret = -EINVAL; } /* check for frame_txtime changes (0 => no changes) */ if (so->opt.frame_txtime) { if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO) so->frame_txtime = 0; else so->frame_txtime = so->opt.frame_txtime; } break; case CAN_ISOTP_RECV_FC: if (optlen != sizeof(struct can_isotp_fc_options)) return -EINVAL; if (copy_from_sockptr(&so->rxfc, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_TX_STMIN: if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&so->force_tx_stmin, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_RX_STMIN: if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&so->force_rx_stmin, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_LL_OPTS: if (optlen == sizeof(struct can_isotp_ll_options)) { struct can_isotp_ll_options ll; if (copy_from_sockptr(&ll, optval, optlen)) return -EFAULT; /* check for correct ISO 11898-1 DLC data length */ if (ll.tx_dl != padlen(ll.tx_dl)) return -EINVAL; if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU) return -EINVAL; if (ll.mtu == CAN_MTU && (ll.tx_dl > CAN_MAX_DLEN || ll.tx_flags != 0)) return -EINVAL; memcpy(&so->ll, &ll, sizeof(ll)); /* set ll_dl for tx path to similar place as for rx */ so->tx.ll_dl = ll.tx_dl; } else { return -EINVAL; } break; default: ret = -ENOPROTOOPT; } return ret; } static int isotp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int ret; if (level != SOL_CAN_ISOTP) return -EINVAL; lock_sock(sk); ret = isotp_setsockopt_locked(sock, level, optname, optval, optlen); release_sock(sk); return ret; } static int isotp_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); int len; void *val; if (level != SOL_CAN_ISOTP) return -EINVAL; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case CAN_ISOTP_OPTS: len = min_t(int, len, sizeof(struct can_isotp_options)); val = &so->opt; break; case CAN_ISOTP_RECV_FC: len = min_t(int, len, sizeof(struct can_isotp_fc_options)); val = &so->rxfc; break; case CAN_ISOTP_TX_STMIN: len = min_t(int, len, sizeof(u32)); val = &so->force_tx_stmin; break; case CAN_ISOTP_RX_STMIN: len = min_t(int, len, sizeof(u32)); val = &so->force_rx_stmin; break; case CAN_ISOTP_LL_OPTS: len = min_t(int, len, sizeof(struct can_isotp_ll_options)); val = &so->ll; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, val, len)) return -EFAULT; return 0; } static void isotp_notify(struct isotp_sock *so, unsigned long msg, struct net_device *dev) { struct sock *sk = &so->sk; if (!net_eq(dev_net(dev), sock_net(sk))) return; if (so->ifindex != dev->ifindex) return; switch (msg) { case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ if (so->bound) { if (isotp_register_rxid(so)) can_rx_unregister(dev_net(dev), dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); can_rx_unregister(dev_net(dev), dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); } so->ifindex = 0; so->bound = 0; release_sock(sk); sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; case NETDEV_DOWN: sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; } } static int isotp_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) return NOTIFY_DONE; if (unlikely(isotp_busy_notifier)) /* Check for reentrant bug. */ return NOTIFY_DONE; spin_lock(&isotp_notifier_lock); list_for_each_entry(isotp_busy_notifier, &isotp_notifier_list, notifier) { spin_unlock(&isotp_notifier_lock); isotp_notify(isotp_busy_notifier, msg, dev); spin_lock(&isotp_notifier_lock); } isotp_busy_notifier = NULL; spin_unlock(&isotp_notifier_lock); return NOTIFY_DONE; } static int isotp_init(struct sock *sk) { struct isotp_sock *so = isotp_sk(sk); so->ifindex = 0; so->bound = 0; so->opt.flags = CAN_ISOTP_DEFAULT_FLAGS; so->opt.ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; so->opt.rx_ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS; so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN; so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX; so->ll.mtu = CAN_ISOTP_DEFAULT_LL_MTU; so->ll.tx_dl = CAN_ISOTP_DEFAULT_LL_TX_DL; so->ll.tx_flags = CAN_ISOTP_DEFAULT_LL_TX_FLAGS; /* set ll_dl for tx path to similar place as for rx */ so->tx.ll_dl = so->ll.tx_dl; so->rx.state = ISOTP_IDLE; so->tx.state = ISOTP_IDLE; so->rx.buf = so->rx.sbuf; so->tx.buf = so->tx.sbuf; so->rx.buflen = ARRAY_SIZE(so->rx.sbuf); so->tx.buflen = ARRAY_SIZE(so->tx.sbuf); hrtimer_setup(&so->rxtimer, isotp_rx_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); hrtimer_setup(&so->txtimer, isotp_tx_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); hrtimer_setup(&so->txfrtimer, isotp_txfr_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); init_waitqueue_head(&so->wait); spin_lock_init(&so->rx_lock); spin_lock(&isotp_notifier_lock); list_add_tail(&so->notifier, &isotp_notifier_list); spin_unlock(&isotp_notifier_lock); return 0; } static __poll_t isotp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); __poll_t mask = datagram_poll(file, sock, wait); poll_wait(file, &so->wait, wait); /* Check for false positives due to TX state */ if ((mask & EPOLLWRNORM) && (so->tx.state != ISOTP_IDLE)) mask &= ~(EPOLLOUT | EPOLLWRNORM); return mask; } static int isotp_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops isotp_ops = { .family = PF_CAN, .release = isotp_release, .bind = isotp_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = isotp_getname, .poll = isotp_poll, .ioctl = isotp_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = isotp_setsockopt, .getsockopt = isotp_getsockopt, .sendmsg = isotp_sendmsg, .recvmsg = isotp_recvmsg, .mmap = sock_no_mmap, }; static struct proto isotp_proto __read_mostly = { .name = "CAN_ISOTP", .owner = THIS_MODULE, .obj_size = sizeof(struct isotp_sock), .init = isotp_init, }; static const struct can_proto isotp_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_ISOTP, .ops = &isotp_ops, .prot = &isotp_proto, }; static struct notifier_block canisotp_notifier = { .notifier_call = isotp_notifier }; static __init int isotp_module_init(void) { int err; max_pdu_size = max_t(unsigned int, max_pdu_size, MAX_12BIT_PDU_SIZE); max_pdu_size = min_t(unsigned int, max_pdu_size, MAX_PDU_SIZE); pr_info("can: isotp protocol (max_pdu_size %d)\n", max_pdu_size); err = can_proto_register(&isotp_can_proto); if (err < 0) pr_err("can: registration of isotp protocol failed %pe\n", ERR_PTR(err)); else register_netdevice_notifier(&canisotp_notifier); return err; } static __exit void isotp_module_exit(void) { can_proto_unregister(&isotp_can_proto); unregister_netdevice_notifier(&canisotp_notifier); } module_init(isotp_module_init); module_exit(isotp_module_exit);
8 8 8 8 6 6 6 8 6 8 8 6 8 8 9 29 37 37 9 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H #include <linux/bio-integrity.h> #include <linux/blk-crypto.h> #include <linux/lockdep.h> #include <linux/memblock.h> /* for max_pfn/max_low_pfn */ #include <linux/sched/sysctl.h> #include <linux/timekeeping.h> #include <xen/xen.h> #include "blk-crypto-internal.h" struct elevator_type; #define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9) #define BLK_MIN_SEGMENT_SIZE 4096 /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) extern struct dentry *blk_debugfs_root; struct blk_flush_queue { spinlock_t mq_flush_lock; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; unsigned long flush_data_in_flight; struct request *flush_rq; }; bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); bool blk_queue_start_drain(struct request_queue *q); bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner); int __bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio); void bio_await_chain(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { rcu_read_lock(); if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter)) goto fail; /* * The code that increments the pm_only counter must ensure that the * counter is globally visible before the queue is unfrozen. */ if (blk_queue_pm_only(q) && (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) goto fail_put; rcu_read_unlock(); return true; fail_put: blk_queue_exit(q); fail: rcu_read_unlock(); return false; } static inline int bio_queue_enter(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (blk_try_enter_queue(q, false)) { rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; } return __bio_queue_enter(q, bio); } static inline void blk_wait_io(struct completion *done) { /* Prevent hang_check timer from firing at us during very long I/O */ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; if (timeout) while (!wait_for_completion_io_timeout(done, timeout)) ; else wait_for_completion_io(done); } struct block_device *blkdev_get_no_open(dev_t dev, bool autoload); void blkdev_put_no_open(struct block_device *bdev); #define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset, bool *same_page); static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = bvec_phys(vec1); phys_addr_t addr2 = bvec_phys(vec2); /* * Merging adjacent physical pages may not work correctly under KMSAN * if their metadata pages aren't adjacent. Just disable merging. */ if (IS_ENABLED(CONFIG_KMSAN)) return false; if (addr1 + vec1->bv_len != addr2) return false; if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page)) return false; if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask)) return false; return true; } static inline bool __bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { return (offset & lim->virt_boundary_mask) || ((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask); } /* * Check if adding a bio_vec after bprv with offset would create a gap in * the SG list. Most drivers don't care about this, but some do. */ static inline bool bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { if (!lim->virt_boundary_mask) return false; return __bvec_gap_to_prev(lim, bprv, offset); } static inline bool rq_mergeable(struct request *rq) { if (blk_rq_is_passthrough(rq)) return false; if (req_op(rq) == REQ_OP_FLUSH) return false; if (req_op(rq) == REQ_OP_WRITE_ZEROES) return false; if (req_op(rq) == REQ_OP_ZONE_APPEND) return false; if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; if (rq->rq_flags & RQF_NOMERGE_FLAGS) return false; return true; } /* * There are two different ways to handle DISCARD merges: * 1) If max_discard_segments > 1, the driver treats every bio as a range and * send the bios to controller together. The ranges don't need to be * contiguous. * 2) Otherwise, the request will be normal read/write requests. The ranges * need to be contiguous. */ static inline bool blk_discard_mergable(struct request *req) { if (req_op(req) == REQ_OP_DISCARD && queue_max_discard_segments(req->q) > 1) return true; return false; } static inline unsigned int blk_rq_get_max_segments(struct request *rq) { if (req_op(rq) == REQ_OP_DISCARD) return queue_max_discard_segments(rq->q); return queue_max_segments(rq->q); } static inline unsigned int blk_queue_get_max_sectors(struct request *rq) { struct request_queue *q = rq->q; enum req_op op = req_op(rq); if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) return min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT); if (unlikely(op == REQ_OP_WRITE_ZEROES)) return q->limits.max_write_zeroes_sectors; if (rq->cmd_flags & REQ_ATOMIC) return q->limits.atomic_write_max_sectors; return q->limits.max_sectors; } #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); void bio_integrity_free(struct bio *bio); /* * Integrity payloads can either be owned by the submitter, in which case * bio_uninit will free them, or owned and generated by the block layer, * in which case we'll verify them here (for reads) and free them before * the bio is handed back to the submitted. */ bool __bio_integrity_endio(struct bio *bio); static inline bool bio_integrity_endio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); if (bip && (bip->bip_flags & BIP_BLOCK_INTEGRITY)) return __bio_integrity_endio(bio); return true; } bool blk_integrity_merge_rq(struct request_queue *, struct request *, struct request *); bool blk_integrity_merge_bio(struct request_queue *, struct request *, struct bio *); static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { struct bio_integrity_payload *bip = bio_integrity(req->bio); struct bio_integrity_payload *bip_next = bio_integrity(next); return bvec_gap_to_prev(&req->q->limits, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } static inline bool integrity_req_gap_front_merge(struct request *req, struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_integrity_payload *bip_next = bio_integrity(req->bio); return bvec_gap_to_prev(&req->q->limits, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } extern const struct attribute_group blk_integrity_attr_group; #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline bool blk_integrity_merge_rq(struct request_queue *rq, struct request *r1, struct request *r2) { return true; } static inline bool blk_integrity_merge_bio(struct request_queue *rq, struct request *r, struct bio *b) { return true; } static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { return false; } static inline bool integrity_req_gap_front_merge(struct request *req, struct bio *bio) { return false; } static inline void blk_flush_integrity(void) { } static inline bool bio_integrity_endio(struct bio *bio) { return true; } static inline void bio_integrity_free(struct bio *bio) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); enum bio_merge_status { BIO_MERGE_OK, BIO_MERGE_NONE, BIO_MERGE_FAILED, }; enum bio_merge_status bio_attempt_back_merge(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); /* * Plug flush limits */ #define BLK_MAX_REQUEST_COUNT 32 #define BLK_PLUG_FLUSH_SIZE (128 * 1024) /* * Internal elevator interface */ #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) bool blk_insert_flush(struct request *rq); int elevator_switch(struct request_queue *q, struct elevator_type *new_e); void elevator_disable(struct request_queue *q); void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, unsigned *nsegs); struct bio *bio_split_write_zeroes(struct bio *bio, const struct queue_limits *lim, unsigned *nsegs); struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs); struct bio *bio_split_zone_append(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs); /* * All drivers must accept single-segments bios that are smaller than PAGE_SIZE. * * This is a quick and dirty check that relies on the fact that bi_io_vec[0] is * always valid if a bio has data. The check might lead to occasional false * positives when bios are cloned, but compared to the performance impact of * cloned bios themselves the loop below doesn't matter anyway. */ static inline bool bio_may_need_split(struct bio *bio, const struct queue_limits *lim) { if (lim->chunk_sectors) return true; if (bio->bi_vcnt != 1) return true; return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > lim->min_segment_size; } /** * __bio_split_to_limits - split a bio to fit the queue limits * @bio: bio to be split * @lim: queue limits to split based on * @nr_segs: returns the number of segments in the returned bio * * Check if @bio needs splitting based on the queue limits, and if so split off * a bio fitting the limits from the beginning of @bio and return it. @bio is * shortened to the remainder and re-submitted. * * The split bio is allocated from @q->bio_split, which is provided by the * block layer. */ static inline struct bio *__bio_split_to_limits(struct bio *bio, const struct queue_limits *lim, unsigned int *nr_segs) { switch (bio_op(bio)) { case REQ_OP_READ: case REQ_OP_WRITE: if (bio_may_need_split(bio, lim)) return bio_split_rw(bio, lim, nr_segs); *nr_segs = 1; return bio; case REQ_OP_ZONE_APPEND: return bio_split_zone_append(bio, lim, nr_segs); case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: return bio_split_discard(bio, lim, nr_segs); case REQ_OP_WRITE_ZEROES: return bio_split_write_zeroes(bio, lim, nr_segs); default: /* other operations can't be split */ *nr_segs = 0; return bio; } } int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); int blk_set_default_limits(struct queue_limits *lim); void blk_apply_bdi_limits(struct backing_dev_info *bdi, struct queue_limits *lim); int blk_dev_init(void); void update_io_ticks(struct block_device *part, unsigned long now, bool end); unsigned int part_in_flight(struct block_device *part); static inline void req_set_nomerge(struct request_queue *q, struct request *req) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; } /* * Internal io_context interface */ struct io_cq *ioc_find_get_icq(struct request_queue *q); struct io_cq *ioc_lookup_icq(struct request_queue *q); #ifdef CONFIG_BLK_ICQ void ioc_clear_queue(struct request_queue *q); #else static inline void ioc_clear_queue(struct request_queue *q) { } #endif /* CONFIG_BLK_ICQ */ struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q); static inline bool blk_queue_may_bounce(struct request_queue *q) { return IS_ENABLED(CONFIG_BOUNCE) && (q->limits.features & BLK_FEAT_BOUNCE_HIGH) && max_low_pfn >= max_pfn; } static inline struct bio *blk_queue_bounce(struct bio *bio, struct request_queue *q) { if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio))) return __blk_queue_bounce(bio, q); return bio; } #ifdef CONFIG_BLK_DEV_ZONED void disk_init_zone_resources(struct gendisk *disk); void disk_free_zone_resources(struct gendisk *disk); static inline bool bio_zone_write_plugging(struct bio *bio) { return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); } void blk_zone_write_plug_bio_merged(struct bio *bio); void blk_zone_write_plug_init_request(struct request *rq); static inline void blk_zone_update_request_bio(struct request *rq, struct bio *bio) { /* * For zone append requests, the request sector indicates the location * at which the BIO data was written. Return this value to the BIO * issuer through the BIO iter sector. * For plugged zone writes, which include emulated zone append, we need * the original BIO sector so that blk_zone_write_plug_bio_endio() can * lookup the zone write plug. */ if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) bio->bi_iter.bi_sector = rq->__sector; } void blk_zone_write_plug_bio_endio(struct bio *bio); static inline void blk_zone_bio_endio(struct bio *bio) { /* * For write BIOs to zoned devices, signal the completion of the BIO so * that the next write BIO can be submitted by zone write plugging. */ if (bio_zone_write_plugging(bio)) blk_zone_write_plug_bio_endio(bio); } void blk_zone_write_plug_finish_request(struct request *rq); static inline void blk_zone_finish_request(struct request *rq) { if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING) blk_zone_write_plug_finish_request(rq); } int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ static inline void disk_init_zone_resources(struct gendisk *disk) { } static inline void disk_free_zone_resources(struct gendisk *disk) { } static inline bool bio_zone_write_plugging(struct bio *bio) { return false; } static inline void blk_zone_write_plug_bio_merged(struct bio *bio) { } static inline void blk_zone_write_plug_init_request(struct request *rq) { } static inline void blk_zone_update_request_bio(struct request *rq, struct bio *bio) { } static inline void blk_zone_bio_endio(struct bio *bio) { } static inline void blk_zone_finish_request(struct request *rq) { } static inline int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { return -ENOTTY; } static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { return -ENOTTY; } #endif /* CONFIG_BLK_DEV_ZONED */ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); void bdev_unhash(struct block_device *bdev); void bdev_drop(struct block_device *bdev); int blk_alloc_ext_minor(void); void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 #define ADDPART_FLAG_READONLY 4 int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); int bdev_del_partition(struct gendisk *disk, int partno); int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); void drop_partition(struct block_device *part); void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors); struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass); /* * Clean up a page appropriately, where the page may be pinned, may have a * ref taken on it or neither. */ static inline void bio_release_page(struct bio *bio, struct page *page) { if (bio_flagged(bio, BIO_PAGE_PINNED)) unpin_user_page(page); } struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); void disk_del_events(struct gendisk *disk); void disk_release_events(struct gendisk *disk); void disk_block_events(struct gendisk *disk); void disk_unblock_events(struct gendisk *disk); void disk_flush_events(struct gendisk *disk, unsigned int mask); extern struct device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; extern struct attribute_group blk_trace_attr_group; blk_mode_t file_to_blk_mode(struct file *file); int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, loff_t lstart, loff_t lend); long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); extern const struct address_space_operations def_blk_aops; int disk_register_independent_access_ranges(struct gendisk *disk); void disk_unregister_independent_access_ranges(struct gendisk *disk); #ifdef CONFIG_FAIL_MAKE_REQUEST bool should_fail_request(struct block_device *part, unsigned int bytes); #else /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool should_fail_request(struct block_device *part, unsigned int bytes) { return false; } #endif /* CONFIG_FAIL_MAKE_REQUEST */ /* * Optimized request reference counting. Ideally we'd make timeouts be more * clever, as that's the only reason we need references at all... But until * this happens, this is faster than using refcount_t. Also see: * * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count") */ #define req_ref_zero_or_close_to_overflow(req) \ ((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u) static inline bool req_ref_inc_not_zero(struct request *req) { return atomic_inc_not_zero(&req->ref); } static inline bool req_ref_put_and_test(struct request *req) { WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); return atomic_dec_and_test(&req->ref); } static inline void req_ref_set(struct request *req, int value) { atomic_set(&req->ref, value); } static inline int req_ref_read(struct request *req) { return atomic_read(&req->ref); } static inline u64 blk_time_get_ns(void) { struct blk_plug *plug = current->plug; if (!plug || !in_task()) return ktime_get_ns(); /* * 0 could very well be a valid time, but rather than flag "this is * a valid timestamp" separately, just accept that we'll do an extra * ktime_get_ns() if we just happen to get 0 as the current time. */ if (!plug->cur_ktime) { plug->cur_ktime = ktime_get_ns(); current->flags |= PF_BLOCK_TS; } return plug->cur_ktime; } static inline ktime_t blk_time_get(void) { return ns_to_ktime(blk_time_get_ns()); } /* * From most significant bit: * 1 bit: reserved for other usage, see below * 12 bits: original size of bio * 51 bits: issue time of bio */ #define BIO_ISSUE_RES_BITS 1 #define BIO_ISSUE_SIZE_BITS 12 #define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) #define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) #define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) #define BIO_ISSUE_SIZE_MASK \ (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) #define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) /* Reserved bit for blk-throtl */ #define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) static inline u64 __bio_issue_time(u64 time) { return time & BIO_ISSUE_TIME_MASK; } static inline u64 bio_issue_time(struct bio_issue *issue) { return __bio_issue_time(issue->value); } static inline sector_t bio_issue_size(struct bio_issue *issue) { return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); } static inline void bio_issue_init(struct bio_issue *issue, sector_t size) { size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | ((u64)size << BIO_ISSUE_SIZE_SHIFT)); } void bdev_release(struct file *bdev_file); int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); void blk_integrity_generate(struct bio *bio); void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); #ifdef CONFIG_LOCKDEP static inline void blk_freeze_acquire_lock(struct request_queue *q) { if (!q->mq_freeze_disk_dead) rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_); if (!q->mq_freeze_queue_dying) rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_); } static inline void blk_unfreeze_release_lock(struct request_queue *q) { if (!q->mq_freeze_queue_dying) rwsem_release(&q->q_lockdep_map, _RET_IP_); if (!q->mq_freeze_disk_dead) rwsem_release(&q->io_lockdep_map, _RET_IP_); } #else static inline void blk_freeze_acquire_lock(struct request_queue *q) { } static inline void blk_unfreeze_release_lock(struct request_queue *q) { } #endif #endif /* BLK_INTERNAL_H */
10 7 3 10 15 8 15 1 6 15 2 15 10 14 2 13 8 1 3 14 15 15 2 15 10 14 10 2 2 2 11 11 15 5 15 10 1 1 10 10 1 1 7 10 14 1 14 2 1 1 26 5 6 18 4 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions implement sctp diag support. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> #include <net/sctp/sctp.h> static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info); /* define some functions to make asoc/ep fill look clean */ static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r, struct sock *sk, struct sctp_association *asoc) { union sctp_addr laddr, paddr; struct dst_entry *dst; struct timer_list *t3_rtx = &asoc->peer.primary_path->T3_rtx_timer; laddr = list_entry(asoc->base.bind_addr.address_list.next, struct sctp_sockaddr_entry, list)->a; paddr = asoc->peer.primary_path->ipaddr; dst = asoc->peer.primary_path->dst; r->idiag_family = sk->sk_family; r->id.idiag_sport = htons(asoc->base.bind_addr.port); r->id.idiag_dport = htons(asoc->peer.port); r->id.idiag_if = dst ? dst->dev->ifindex : 0; sock_diag_save_cookie(sk, r->id.idiag_cookie); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { *(struct in6_addr *)r->id.idiag_src = laddr.v6.sin6_addr; *(struct in6_addr *)r->id.idiag_dst = paddr.v6.sin6_addr; } else #endif { memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); r->id.idiag_src[0] = laddr.v4.sin_addr.s_addr; r->id.idiag_dst[0] = paddr.v4.sin_addr.s_addr; } r->idiag_state = asoc->state; if (timer_pending(t3_rtx)) { r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX; r->idiag_retrans = asoc->rtx_data_chunks; r->idiag_expires = jiffies_to_msecs(t3_rtx->expires - jiffies); } } static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, struct list_head *address_list) { struct sctp_sockaddr_entry *laddr; int addrlen = sizeof(struct sockaddr_storage); int addrcnt = 0; struct nlattr *attr; void *info = NULL; list_for_each_entry_rcu(laddr, address_list, list) addrcnt++; attr = nla_reserve(skb, INET_DIAG_LOCALS, addrlen * addrcnt); if (!attr) return -EMSGSIZE; info = nla_data(attr); list_for_each_entry_rcu(laddr, address_list, list) { memcpy(info, &laddr->a, sizeof(laddr->a)); memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); info += addrlen; } return 0; } static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb, struct sctp_association *asoc) { int addrlen = sizeof(struct sockaddr_storage); struct sctp_transport *from; struct nlattr *attr; void *info = NULL; attr = nla_reserve(skb, INET_DIAG_PEERS, addrlen * asoc->peer.transport_count); if (!attr) return -EMSGSIZE; info = nla_data(attr); list_for_each_entry(from, &asoc->peer.transport_addr_list, transports) { memcpy(info, &from->ipaddr, sizeof(from->ipaddr)); memset(info + sizeof(from->ipaddr), 0, addrlen - sizeof(from->ipaddr)); info += addrlen; } return 0; } /* sctp asoc/ep fill*/ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, int portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh, bool net_admin) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct list_head *addr_list; struct inet_diag_msg *r; struct nlmsghdr *nlh; int ext = req->idiag_ext; struct sctp_infox infox; void *info = NULL; nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); BUG_ON(!sk_fullsock(sk)); r->idiag_timer = 0; r->idiag_retrans = 0; r->idiag_expires = 0; if (asoc) { inet_diag_msg_sctpasoc_fill(r, sk, asoc); } else { inet_diag_msg_common_fill(r, sk); r->idiag_state = sk->sk_state; } if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) goto errout; if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) { u32 mem[SK_MEMINFO_VARS]; int amt; if (asoc && asoc->ep->sndbuf_policy) amt = asoc->sndbuf_used; else amt = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_WMEM_ALLOC] = amt; if (asoc && asoc->ep->rcvbuf_policy) amt = atomic_read(&asoc->rmem_alloc); else amt = sk_rmem_alloc_get(sk); mem[SK_MEMINFO_RMEM_ALLOC] = amt; mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) goto errout; } if (ext & (1 << (INET_DIAG_INFO - 1))) { struct nlattr *attr; attr = nla_reserve_64bit(skb, INET_DIAG_INFO, sizeof(struct sctp_info), INET_DIAG_PAD); if (!attr) goto errout; info = nla_data(attr); } infox.sctpinfo = (struct sctp_info *)info; infox.asoc = asoc; sctp_diag_get_info(sk, r, &infox); addr_list = asoc ? &asoc->base.bind_addr.address_list : &ep->base.bind_addr.address_list; if (inet_diag_msg_sctpladdrs_fill(skb, addr_list)) goto errout; if (asoc && (ext & (1 << (INET_DIAG_CONG - 1)))) if (nla_put_string(skb, INET_DIAG_CONG, "reno") < 0) goto errout; if (asoc && inet_diag_msg_sctpaddrs_fill(skb, asoc)) goto errout; nlmsg_end(skb, nlh); return 0; errout: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } /* callback and param */ struct sctp_comm_param { struct sk_buff *skb; struct netlink_callback *cb; const struct inet_diag_req_v2 *r; const struct nlmsghdr *nlh; bool net_admin; }; static size_t inet_assoc_attr_size(struct sctp_association *asoc) { int addrlen = sizeof(struct sockaddr_storage); int addrcnt = 0; struct sctp_sockaddr_entry *laddr; list_for_each_entry_rcu(laddr, &asoc->base.bind_addr.address_list, list) addrcnt++; return nla_total_size(sizeof(struct sctp_info)) + nla_total_size(addrlen * asoc->peer.transport_count) + nla_total_size(addrlen * addrcnt) + nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64; } static int sctp_sock_dump_one(struct sctp_endpoint *ep, struct sctp_transport *tsp, void *p) { struct sctp_association *assoc = tsp->asoc; struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; const struct inet_diag_req_v2 *req = commp->r; struct sk_buff *skb = commp->skb; struct sk_buff *rep; int err; err = sock_diag_check_cookie(sk, req->id.idiag_cookie); if (err) return err; rep = nlmsg_new(inet_assoc_attr_size(assoc), GFP_KERNEL); if (!rep) return -ENOMEM; lock_sock(sk); if (ep != assoc->ep) { err = -EAGAIN; goto out; } err = inet_sctp_diag_fill(sk, assoc, rep, req, sk_user_ns(NETLINK_CB(skb).sk), NETLINK_CB(skb).portid, commp->nlh->nlmsg_seq, 0, commp->nlh, commp->net_admin); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto out; } release_sock(sk); return nlmsg_unicast(sock_net(skb->sk)->diag_nlsk, rep, NETLINK_CB(skb).portid); out: release_sock(sk); kfree_skb(rep); return err; } static int sctp_sock_dump(struct sctp_endpoint *ep, struct sctp_transport *tsp, void *p) { struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; struct sk_buff *skb = commp->skb; struct netlink_callback *cb = commp->cb; const struct inet_diag_req_v2 *r = commp->r; struct sctp_association *assoc; int err = 0; lock_sock(sk); if (ep != tsp->asoc->ep) goto release; list_for_each_entry(assoc, &ep->asocs, asocs) { if (cb->args[4] < cb->args[1]) goto next; if (r->id.idiag_sport != htons(assoc->base.bind_addr.port) && r->id.idiag_sport) goto next; if (r->id.idiag_dport != htons(assoc->peer.port) && r->id.idiag_dport) goto next; if (!cb->args[3] && inet_sctp_diag_fill(sk, NULL, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, commp->net_admin) < 0) { err = 1; goto release; } cb->args[3] = 1; if (inet_sctp_diag_fill(sk, assoc, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, cb->nlh, commp->net_admin) < 0) { err = 1; goto release; } next: cb->args[4]++; } cb->args[1] = 0; cb->args[3] = 0; cb->args[4] = 0; release: release_sock(sk); return err; } static int sctp_sock_filter(struct sctp_endpoint *ep, struct sctp_transport *tsp, void *p) { struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; const struct inet_diag_req_v2 *r = commp->r; /* find the ep only once through the transports by this condition */ if (!list_is_first(&tsp->asoc->asocs, &ep->asocs)) return 0; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) return 0; return 1; } static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) { struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; struct sk_buff *skb = commp->skb; struct netlink_callback *cb = commp->cb; const struct inet_diag_req_v2 *r = commp->r; struct net *net = sock_net(skb->sk); struct inet_sock *inet = inet_sk(sk); int err = 0; if (!net_eq(sock_net(sk), net)) goto out; if (cb->args[4] < cb->args[1]) goto next; if (!(r->idiag_states & TCPF_LISTEN) && !list_empty(&ep->asocs)) goto next; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (inet_sctp_diag_fill(sk, NULL, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, commp->net_admin) < 0) { err = 2; goto out; } next: cb->args[4]++; out: return err; } /* define the functions for sctp_diag_handler*/ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { struct sctp_infox *infox = (struct sctp_infox *)info; if (infox->asoc) { r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc); r->idiag_wqueue = infox->asoc->sndbuf_used; } else { r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog); r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); } if (infox->sctpinfo) sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo); } static int sctp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *skb = cb->skb; struct net *net = sock_net(skb->sk); const struct nlmsghdr *nlh = cb->nlh; union sctp_addr laddr, paddr; int dif = req->id.idiag_if; struct sctp_comm_param commp = { .skb = skb, .r = req, .nlh = nlh, .net_admin = netlink_net_capable(skb, CAP_NET_ADMIN), }; if (req->sdiag_family == AF_INET) { laddr.v4.sin_port = req->id.idiag_sport; laddr.v4.sin_addr.s_addr = req->id.idiag_src[0]; laddr.v4.sin_family = AF_INET; paddr.v4.sin_port = req->id.idiag_dport; paddr.v4.sin_addr.s_addr = req->id.idiag_dst[0]; paddr.v4.sin_family = AF_INET; } else { laddr.v6.sin6_port = req->id.idiag_sport; memcpy(&laddr.v6.sin6_addr, req->id.idiag_src, sizeof(laddr.v6.sin6_addr)); laddr.v6.sin6_family = AF_INET6; paddr.v6.sin6_port = req->id.idiag_dport; memcpy(&paddr.v6.sin6_addr, req->id.idiag_dst, sizeof(paddr.v6.sin6_addr)); paddr.v6.sin6_family = AF_INET6; } return sctp_transport_lookup_process(sctp_sock_dump_one, net, &laddr, &paddr, &commp, dif); } static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { u32 idiag_states = r->idiag_states; struct net *net = sock_net(skb->sk); struct sctp_comm_param commp = { .skb = skb, .cb = cb, .r = r, .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; int pos = cb->args[2]; /* eps hashtable dumps * args: * 0 : if it will traversal listen sock * 1 : to record the sock pos of this time's traversal * 4 : to work as a temporary variable to traversal list */ if (cb->args[0] == 0) { if (!(idiag_states & TCPF_LISTEN)) goto skip; if (sctp_for_each_endpoint(sctp_ep_dump, &commp)) goto done; skip: cb->args[0] = 1; cb->args[1] = 0; cb->args[4] = 0; } /* asocs by transport hashtable dump * args: * 1 : to record the assoc pos of this time's traversal * 2 : to record the transport pos of this time's traversal * 3 : to mark if we have dumped the ep info of the current asoc * 4 : to work as a temporary variable to traversal list * 5 : to save the sk we get from travelsing the tsp list. */ if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE))) goto done; sctp_transport_traverse_process(sctp_sock_filter, sctp_sock_dump, net, &pos, &commp); cb->args[2] = pos; done: cb->args[1] = cb->args[4]; cb->args[4] = 0; } static const struct inet_diag_handler sctp_diag_handler = { .owner = THIS_MODULE, .dump = sctp_diag_dump, .dump_one = sctp_diag_dump_one, .idiag_get_info = sctp_diag_get_info, .idiag_type = IPPROTO_SCTP, .idiag_info_size = sizeof(struct sctp_info), }; static int __init sctp_diag_init(void) { return inet_diag_register(&sctp_diag_handler); } static void __exit sctp_diag_exit(void) { inet_diag_unregister(&sctp_diag_handler); } module_init(sctp_diag_init); module_exit(sctp_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SCTP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-132);
110 155 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Queued spinlock * * A 'generic' spinlock implementation that is based on MCS locks. For an * architecture that's looking for a 'generic' spinlock, please first consider * ticket-lock.h and only come looking here when you've considered all the * constraints below and can show your hardware does actually perform better * with qspinlock. * * qspinlock relies on atomic_*_release()/atomic_*_acquire() to be RCsc (or no * weaker than RCtso if you're power), where regular code only expects atomic_t * to be RCpc. * * qspinlock relies on a far greater (compared to asm-generic/spinlock.h) set * of atomic operations to behave well together, please audit them carefully to * ensure they all have forward progress. Many atomic operations may default to * cmpxchg() loops which will not have good forward progress properties on * LL/SC architectures. * * One notable example is atomic_fetch_or_acquire(), which x86 cannot (cheaply) * do. Carefully read the patches that introduced * queued_fetch_set_pending_acquire(). * * qspinlock also heavily relies on mixed size atomic operations, in specific * it requires architectures to have xchg16; something which many LL/SC * architectures need to implement as a 32bit and+or in order to satisfy the * forward progress guarantees mentioned above. * * Further reading on mixed size atomics that might be relevant: * * http://www.cl.cam.ac.uk/~pes20/popl17/mixed-size.pdf * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * * Authors: Waiman Long <waiman.long@hpe.com> */ #ifndef __ASM_GENERIC_QSPINLOCK_H #define __ASM_GENERIC_QSPINLOCK_H #include <asm-generic/qspinlock_types.h> #include <linux/atomic.h> #ifndef queued_spin_is_locked /** * queued_spin_is_locked - is the spinlock locked? * @lock: Pointer to queued spinlock structure * Return: 1 if it is locked, 0 otherwise */ static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { /* * Any !0 state indicates it is locked, even if _Q_LOCKED_VAL * isn't immediately observable. */ return atomic_read(&lock->val); } #endif /** * queued_spin_value_unlocked - is the spinlock structure unlocked? * @lock: queued spinlock structure * Return: 1 if it is unlocked, 0 otherwise * * N.B. Whenever there are tasks waiting for the lock, it is considered * locked wrt the lockref code to avoid lock stealing by the lockref * code and change things underneath the lock. This also allows some * optimizations to be applied without conflict with lockref. */ static __always_inline int queued_spin_value_unlocked(struct qspinlock lock) { return !lock.val.counter; } /** * queued_spin_is_contended - check if the lock is contended * @lock : Pointer to queued spinlock structure * Return: 1 if lock contended, 0 otherwise */ static __always_inline int queued_spin_is_contended(struct qspinlock *lock) { return atomic_read(&lock->val) & ~_Q_LOCKED_MASK; } /** * queued_spin_trylock - try to acquire the queued spinlock * @lock : Pointer to queued spinlock structure * Return: 1 if lock acquired, 0 if failed */ static __always_inline int queued_spin_trylock(struct qspinlock *lock) { int val = atomic_read(&lock->val); if (unlikely(val)) return 0; return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); } extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); #ifndef queued_spin_lock /** * queued_spin_lock - acquire a queued spinlock * @lock: Pointer to queued spinlock structure */ static __always_inline void queued_spin_lock(struct qspinlock *lock) { int val = 0; if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) return; queued_spin_lock_slowpath(lock, val); } #endif #ifndef queued_spin_unlock /** * queued_spin_unlock - release a queued spinlock * @lock : Pointer to queued spinlock structure */ static __always_inline void queued_spin_unlock(struct qspinlock *lock) { /* * unlock() needs release semantics: */ smp_store_release(&lock->locked, 0); } #endif #ifndef virt_spin_lock static __always_inline bool virt_spin_lock(struct qspinlock *lock) { return false; } #endif #ifndef __no_arch_spinlock_redefine /* * Remapping spinlock architecture specific functions to the corresponding * queued spinlock functions. */ #define arch_spin_is_locked(l) queued_spin_is_locked(l) #define arch_spin_is_contended(l) queued_spin_is_contended(l) #define arch_spin_value_unlocked(l) queued_spin_value_unlocked(l) #define arch_spin_lock(l) queued_spin_lock(l) #define arch_spin_trylock(l) queued_spin_trylock(l) #define arch_spin_unlock(l) queued_spin_unlock(l) #endif #endif /* __ASM_GENERIC_QSPINLOCK_H */
9 9 18 1 2 4 11 2 9 1 9 4 4 3 4 3 2 3 90 90 11 49 30 30 30 30 10 4 17 21 17 4 7 14 90 2 1 1 6 6 2 4 6 5 1 2 1 1 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook */ #include <linux/bpf.h> #include <linux/jhash.h> #include <linux/filter.h> #include <linux/kernel.h> #include <linux/stacktrace.h> #include <linux/perf_event.h> #include <linux/btf_ids.h> #include <linux/buildid.h> #include "percpu_freelist.h" #include "mmap_unlock_work.h" #define STACK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ BPF_F_STACK_BUILD_ID) struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; u32 nr; u64 data[]; }; struct bpf_stack_map { struct bpf_map map; void *elems; struct pcpu_freelist freelist; u32 n_buckets; struct stack_map_bucket *buckets[] __counted_by(n_buckets); }; static inline bool stack_map_use_build_id(struct bpf_map *map) { return (map->map_flags & BPF_F_STACK_BUILD_ID); } static inline int stack_map_data_size(struct bpf_map *map) { return stack_map_use_build_id(map) ? sizeof(struct bpf_stack_build_id) : sizeof(u64); } static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u64 elem_size = sizeof(struct stack_map_bucket) + (u64)smap->map.value_size; int err; smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries, smap->map.numa_node); if (!smap->elems) return -ENOMEM; err = pcpu_freelist_init(&smap->freelist); if (err) goto free_elems; pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, smap->map.max_entries); return 0; free_elems: bpf_map_area_free(smap->elems); return err; } /* Called from syscall */ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_stack_map *smap; u64 cost, n_buckets; int err; if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || value_size < 8 || value_size % 8) return ERR_PTR(-EINVAL); BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); if (attr->map_flags & BPF_F_STACK_BUILD_ID) { if (value_size % sizeof(struct bpf_stack_build_id) || value_size / sizeof(struct bpf_stack_build_id) > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2; roundup_pow_of_two() can overflow * into UB on 32-bit arches, so check that first */ if (attr->max_entries > 1UL << 31) return ERR_PTR(-E2BIG); n_buckets = roundup_pow_of_two(attr->max_entries); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); smap->n_buckets = n_buckets; err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) goto free_smap; err = prealloc_elems_and_freelist(smap); if (err) goto put_buffers; return &smap->map; put_buffers: put_callchain_buffers(); free_smap: bpf_map_area_free(smap); return ERR_PTR(err); } static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault) { return may_fault ? build_id_parse(vma, build_id, NULL) : build_id_parse_nofault(vma, build_id, NULL); } /* * Expects all id_offs[i].ip values to be set to correct initial IPs. * They will be subsequently: * - either adjusted in place to a file offset, if build ID fetching * succeeds; in this case id_offs[i].build_id is set to correct build ID, * and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID; * - or IP will be kept intact, if build ID fetching failed; in this case * id_offs[i].build_id is zeroed out and id_offs[i].status is set to * BPF_STACK_BUILD_ID_IP. */ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u32 trace_nr, bool user, bool may_fault) { int i; struct mmap_unlock_irq_work *work = NULL; bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); struct vm_area_struct *vma, *prev_vma = NULL; const char *prev_build_id; /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with * build_id. */ if (!user || !current || !current->mm || irq_work_busy || !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); } return; } for (i = 0; i < trace_nr; i++) { u64 ip = READ_ONCE(id_offs[i].ip); if (range_in_vma(prev_vma, ip, ip)) { vma = prev_vma; memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX); goto build_id_valid; } vma = find_vma(current->mm, ip); if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } build_id_valid: id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; prev_vma = vma; prev_build_id = id_offs[i].build_id; } bpf_mmap_unlock_mm(work, current->mm); } static struct perf_callchain_entry * get_callchain_entry_for_task(struct task_struct *task, u32 max_depth) { #ifdef CONFIG_STACKTRACE struct perf_callchain_entry *entry; int rctx; entry = get_callchain_entry(&rctx); if (!entry) return NULL; entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip, max_depth, 0); /* stack_trace_save_tsk() works on unsigned long array, while * perf_callchain_entry uses u64 array. For 32-bit systems, it is * necessary to fix this mismatch. */ if (__BITS_PER_LONG != 64) { unsigned long *from = (unsigned long *) entry->ip; u64 *to = entry->ip; int i; /* copy data from the end to avoid using extra buffer */ for (i = entry->nr - 1; i >= 0; i--) to[i] = (u64)(from[i]); } put_callchain_entry(rctx); return entry; #else /* CONFIG_STACKTRACE */ return NULL; #endif } static long __bpf_get_stackid(struct bpf_map *map, struct perf_callchain_entry *trace, u64 flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len, i; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; if (trace->nr <= skip) /* skipping more than usable stack trace */ return -EFAULT; trace_nr = trace->nr - skip; trace_len = trace_nr * sizeof(u64); ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); hash_matches = bucket && bucket->hash == hash; /* fast cmp */ if (hash_matches && flags & BPF_F_FAST_STACK_CMP) return id; if (stack_map_use_build_id(map)) { struct bpf_stack_build_id *id_offs; /* for build_id+offset, pop a bucket before slow cmp */ new_bucket = (struct stack_map_bucket *) pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; new_bucket->nr = trace_nr; id_offs = (struct bpf_stack_build_id *)new_bucket->data; for (i = 0; i < trace_nr; i++) id_offs[i].ip = ips[i]; stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */); trace_len = trace_nr * sizeof(struct bpf_stack_build_id); if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, new_bucket->data, trace_len) == 0) { pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return id; } if (bucket && !(flags & BPF_F_REUSE_STACKID)) { pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return -EEXIST; } } else { if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, ips, trace_len) == 0) return id; if (bucket && !(flags & BPF_F_REUSE_STACKID)) return -EEXIST; new_bucket = (struct stack_map_bucket *) pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; memcpy(new_bucket->data, ips, trace_len); } new_bucket->hash = hash; new_bucket->nr = trace_nr; old_bucket = xchg(&smap->buckets[id], new_bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return id; } BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { u32 max_depth = map->value_size / stack_map_data_size(map); u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; max_depth += skip; if (max_depth > sysctl_perf_event_max_stack) max_depth = sysctl_perf_event_max_stack; trace = get_perf_callchain(regs, 0, kernel, user, max_depth, false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ return -EFAULT; return __bpf_get_stackid(map, trace, flags); } const struct bpf_func_proto bpf_get_stackid_proto = { .func = bpf_get_stackid, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; static __u64 count_kernel_ip(struct perf_callchain_entry *trace) { __u64 nr_kernel = 0; while (nr_kernel < trace->nr) { if (trace->ip[nr_kernel] == PERF_CONTEXT_USER) break; nr_kernel++; } return nr_kernel; } BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, struct bpf_map *, map, u64, flags) { struct perf_event *event = ctx->event; struct perf_callchain_entry *trace; bool kernel, user; __u64 nr_kernel; int ret; /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return bpf_get_stackid((unsigned long)(ctx->regs), (unsigned long) map, flags, 0, 0); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; user = flags & BPF_F_USER_STACK; kernel = !user; trace = ctx->data->callchain; if (unlikely(!trace)) return -EFAULT; nr_kernel = count_kernel_ip(trace); if (kernel) { __u64 nr = trace->nr; trace->nr = nr_kernel; ret = __bpf_get_stackid(map, trace, flags); /* restore nr */ trace->nr = nr; } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; skip += nr_kernel; if (skip > BPF_F_SKIP_FIELD_MASK) return -EFAULT; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; ret = __bpf_get_stackid(map, trace, flags); } return ret; } const struct bpf_func_proto bpf_get_stackid_proto_pe = { .func = bpf_get_stackid_pe, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags, bool may_fault) { u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; bool crosstask = task && task != current; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; int err = -EINVAL; u64 *ips; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) goto clear; if (kernel && user_build_id) goto clear; elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64); if (unlikely(size % elem_size)) goto clear; /* cannot get valid user stack for task without user_mode regs */ if (task && user && !user_mode(regs)) goto err_fault; /* get_perf_callchain does not support crosstask user stack walking * but returns an empty stack instead of NULL. */ if (crosstask && user) { err = -EOPNOTSUPP; goto clear; } num_elem = size / elem_size; max_depth = num_elem + skip; if (sysctl_perf_event_max_stack < max_depth) max_depth = sysctl_perf_event_max_stack; if (may_fault) rcu_read_lock(); /* need RCU for perf's callchain below */ if (trace_in) trace = trace_in; else if (kernel && task) trace = get_callchain_entry_for_task(task, max_depth); else trace = get_perf_callchain(regs, 0, kernel, user, max_depth, crosstask, false); if (unlikely(!trace) || trace->nr < skip) { if (may_fault) rcu_read_unlock(); goto err_fault; } trace_nr = trace->nr - skip; trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; ips = trace->ip + skip; if (user_build_id) { struct bpf_stack_build_id *id_offs = buf; u32 i; for (i = 0; i < trace_nr; i++) id_offs[i].ip = ips[i]; } else { memcpy(buf, ips, copy_len); } /* trace/ips should not be dereferenced after this point */ if (may_fault) rcu_read_unlock(); if (user_build_id) stack_map_get_build_id_offset(buf, trace_nr, user, may_fault); if (size > copy_len) memset(buf + copy_len, 0, size - copy_len); return copy_len; err_fault: err = -EFAULT; clear: memset(buf, 0, size); return err; } BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, u64, flags) { return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); } const struct bpf_func_proto bpf_get_stack_proto = { .func = bpf_get_stack, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size, u64, flags) { return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */); } const struct bpf_func_proto bpf_get_stack_sleepable_proto = { .func = bpf_get_stack_sleepable, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags, bool may_fault) { struct pt_regs *regs; long res = -EINVAL; if (!try_get_task_stack(task)) return -EFAULT; regs = task_pt_regs(task); if (regs) res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault); put_task_stack(task); return res; } BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, u32, size, u64, flags) { return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */); } const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf, u32, size, u64, flags) { return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */); } const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = { .func = bpf_get_task_stack_sleepable, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { struct pt_regs *regs = (struct pt_regs *)(ctx->regs); struct perf_event *event = ctx->event; struct perf_callchain_entry *trace; bool kernel, user; int err = -EINVAL; __u64 nr_kernel; if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) goto clear; user = flags & BPF_F_USER_STACK; kernel = !user; err = -EFAULT; trace = ctx->data->callchain; if (unlikely(!trace)) goto clear; nr_kernel = count_kernel_ip(trace); if (kernel) { __u64 nr = trace->nr; trace->nr = nr_kernel; err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); /* restore nr */ trace->nr = nr; } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; skip += nr_kernel; if (skip > BPF_F_SKIP_FIELD_MASK) goto clear; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); } return err; clear: memset(buf, 0, size); return err; } const struct bpf_func_proto bpf_get_stack_proto_pe = { .func = bpf_get_stack_pe, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } /* Called from syscall */ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *old_bucket; u32 id = *(u32 *)key, trace_len; if (unlikely(id >= smap->n_buckets)) return -ENOENT; bucket = xchg(&smap->buckets[id], NULL); if (!bucket) return -ENOENT; trace_len = bucket->nr * stack_map_data_size(map); memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); old_bucket = xchg(&smap->buckets[id], bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; } static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); u32 id; WARN_ON_ONCE(!rcu_read_lock_held()); if (!key) { id = 0; } else { id = *(u32 *)key; if (id >= smap->n_buckets || !smap->buckets[id]) id = 0; else id++; } while (id < smap->n_buckets && !smap->buckets[id]) id++; if (id >= smap->n_buckets) return -ENOENT; *(u32 *)next_key = id; return 0; } static long stack_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return -EINVAL; } /* Called from syscall or from eBPF program */ static long stack_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *old_bucket; u32 id = *(u32 *)key; if (unlikely(id >= smap->n_buckets)) return -E2BIG; old_bucket = xchg(&smap->buckets[id], NULL); if (old_bucket) { pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; } else { return -ENOENT; } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void stack_map_free(struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); bpf_map_area_free(smap->elems); pcpu_freelist_destroy(&smap->freelist); bpf_map_area_free(smap); put_callchain_buffers(); } static u64 stack_map_mem_usage(const struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); u64 value_size = map->value_size; u64 n_buckets = smap->n_buckets; u64 enties = map->max_entries; u64 usage = sizeof(*smap); usage += n_buckets * sizeof(struct stack_map_bucket *); usage += enties * (sizeof(struct stack_map_bucket) + value_size); return usage; } BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map) const struct bpf_map_ops stack_trace_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, .map_lookup_elem = stack_map_lookup_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = stack_map_mem_usage, .map_btf_id = &stack_trace_map_btf_ids[0], };
17 7 1 1 5 2 2 1 1 2 6 4 1 1 5 5 10 10 63 5 58 7 7 1 1 1 1 1 1 5 5 5 5 60 60 60 60 60 60 60 28 28 48 48 48 3 35 35 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_drr.c Deficit Round Robin scheduler * * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/pkt_sched.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> struct drr_class { struct Qdisc_class_common common; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct list_head alist; struct Qdisc *qdisc; u32 quantum; u32 deficit; }; struct drr_sched { struct list_head active; struct tcf_proto __rcu *filter_list; struct tcf_block *block; struct Qdisc_class_hash clhash; }; static bool cl_is_active(struct drr_class *cl) { return !list_empty(&cl->alist); } static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) { struct drr_sched *q = qdisc_priv(sch); struct Qdisc_class_common *clc; clc = qdisc_class_find(&q->clhash, classid); if (clc == NULL) return NULL; return container_of(clc, struct drr_class, common); } static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = { [TCA_DRR_QUANTUM] = { .type = NLA_U32 }, }; static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg, struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)*arg; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_DRR_MAX + 1]; u32 quantum; int err; if (!opt) { NL_SET_ERR_MSG(extack, "DRR options are required for this operation"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_DRR_MAX, opt, drr_policy, extack); if (err < 0) return err; if (tb[TCA_DRR_QUANTUM]) { quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]); if (quantum == 0) { NL_SET_ERR_MSG(extack, "Specified DRR quantum cannot be zero"); return -EINVAL; } } else quantum = psched_mtu(qdisc_dev(sch)); if (cl != NULL) { if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); return err; } } sch_tree_lock(sch); if (tb[TCA_DRR_QUANTUM]) cl->quantum = quantum; sch_tree_unlock(sch); return 0; } cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL); if (cl == NULL) return -ENOBUFS; gnet_stats_basic_sync_init(&cl->bstats); INIT_LIST_HEAD(&cl->alist); cl->common.classid = classid; cl->quantum = quantum; cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, NULL); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; else qdisc_hash_add(cl->qdisc, true); if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); qdisc_put(cl->qdisc); kfree(cl); return err; } } sch_tree_lock(sch); qdisc_class_hash_insert(&q->clhash, &cl->common); sch_tree_unlock(sch); qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; return 0; } static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl) { gen_kill_estimator(&cl->rate_est); qdisc_put(cl->qdisc); kfree(cl); } static int drr_delete_class(struct Qdisc *sch, unsigned long arg, struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)arg; if (qdisc_class_in_use(&cl->common)) { NL_SET_ERR_MSG(extack, "DRR class is in use"); return -EBUSY; } sch_tree_lock(sch); qdisc_purge_queue(cl->qdisc); qdisc_class_hash_remove(&q->clhash, &cl->common); sch_tree_unlock(sch); drr_destroy_class(sch, cl); return 0; } static unsigned long drr_search_class(struct Qdisc *sch, u32 classid) { return (unsigned long)drr_find_class(sch, classid); } static struct tcf_block *drr_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); if (cl) { NL_SET_ERR_MSG(extack, "DRR classid must be zero"); return NULL; } return q->block; } static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) { struct drr_class *cl = drr_find_class(sch, classid); if (cl) qdisc_class_get(&cl->common); return (unsigned long)cl; } static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; qdisc_class_put(&cl->common); } static int drr_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct drr_class *cl = (struct drr_class *)arg; if (new == NULL) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, cl->common.classid, NULL); if (new == NULL) new = &noop_qdisc; } *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } static struct Qdisc *drr_class_leaf(struct Qdisc *sch, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; return cl->qdisc; } static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; list_del_init(&cl->alist); } static int drr_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct drr_class *cl = (struct drr_class *)arg; struct nlattr *nest; tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle = cl->common.classid; tcm->tcm_info = cl->qdisc->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct drr_class *cl = (struct drr_class *)arg; __u32 qlen = qdisc_qlen_sum(cl->qdisc); struct Qdisc *cl_q = cl->qdisc; struct tc_drr_stats xstats; memset(&xstats, 0, sizeof(xstats)); if (qlen) xstats.deficit = cl->deficit; if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, cl_q->cpu_qstats, &cl_q->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg)) return; } } } static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; struct tcf_result res; struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { cl = drr_find_class(sch, skb->priority); if (cl != NULL) return cl; } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif cl = (struct drr_class *)res.class; if (cl == NULL) cl = drr_find_class(sch, res.classid); return cl; } return NULL; } static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; int err = 0; cl = drr_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return err; } err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; qdisc_qstats_drop(sch); } return err; } if (!cl_is_active(cl)) { list_add_tail(&cl->alist, &q->active); cl->deficit = cl->quantum; } sch->qstats.backlog += len; sch->q.qlen++; return err; } static struct sk_buff *drr_dequeue(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; struct sk_buff *skb; unsigned int len; if (list_empty(&q->active)) goto out; while (1) { cl = list_first_entry(&q->active, struct drr_class, alist); skb = cl->qdisc->ops->peek(cl->qdisc); if (skb == NULL) { qdisc_warn_nonwc(__func__, cl->qdisc); goto out; } len = qdisc_pkt_len(skb); if (len <= cl->deficit) { cl->deficit -= len; skb = qdisc_dequeue_peeked(cl->qdisc); if (unlikely(skb == NULL)) goto out; if (cl->qdisc->q.qlen == 0) list_del_init(&cl->alist); bstats_update(&cl->bstats, skb); qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } cl->deficit += cl->quantum; list_move_tail(&cl->alist, &q->active); } out: return NULL; } static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); int err; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; INIT_LIST_HEAD(&q->active); return 0; } static void drr_reset_qdisc(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->qdisc->q.qlen) list_del_init(&cl->alist); qdisc_reset(cl->qdisc); } } } static void drr_destroy_qdisc(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; struct hlist_node *next; unsigned int i; tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) drr_destroy_class(sch, cl); } qdisc_class_hash_destroy(&q->clhash); } static const struct Qdisc_class_ops drr_class_ops = { .change = drr_change_class, .delete = drr_delete_class, .find = drr_search_class, .tcf_block = drr_tcf_block, .bind_tcf = drr_bind_tcf, .unbind_tcf = drr_unbind_tcf, .graft = drr_graft_class, .leaf = drr_class_leaf, .qlen_notify = drr_qlen_notify, .dump = drr_dump_class, .dump_stats = drr_dump_class_stats, .walk = drr_walk, }; static struct Qdisc_ops drr_qdisc_ops __read_mostly = { .cl_ops = &drr_class_ops, .id = "drr", .priv_size = sizeof(struct drr_sched), .enqueue = drr_enqueue, .dequeue = drr_dequeue, .peek = qdisc_peek_dequeued, .init = drr_init_qdisc, .reset = drr_reset_qdisc, .destroy = drr_destroy_qdisc, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("drr"); static int __init drr_init(void) { return register_qdisc(&drr_qdisc_ops); } static void __exit drr_exit(void) { unregister_qdisc(&drr_qdisc_ops); } module_init(drr_init); module_exit(drr_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Deficit Round Robin scheduler");
416 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_DEFS_H #define _ASM_X86_PGTABLE_DEFS_H #include <linux/const.h> #include <linux/mem_encrypt.h> #include <asm/page_types.h> #define _PAGE_BIT_PRESENT 0 /* is present */ #define _PAGE_BIT_RW 1 /* writeable */ #define _PAGE_BIT_USER 2 /* userspace addressable */ #define _PAGE_BIT_PWT 3 /* page write through */ #define _PAGE_BIT_PCD 4 /* page cache disabled */ #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_SOFTW1 9 /* available for programmer */ #define _PAGE_BIT_SOFTW2 10 /* " */ #define _PAGE_BIT_SOFTW3 11 /* " */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SOFTW4 57 /* available for programmer */ #define _PAGE_BIT_SOFTW5 58 /* available for programmer */ #define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */ #define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */ #define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */ #define _PAGE_BIT_PKEY_BIT3 62 /* Protection Keys, bit 4/4 */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 #define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 #ifdef CONFIG_X86_64 #define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit (leaf) */ #define _PAGE_BIT_NOPTISHADOW _PAGE_BIT_SOFTW5 /* No PTI shadow (root PGD) */ #else /* Shared with _PAGE_BIT_UFFD_WP which is not supported on 32 bit */ #define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW2 /* Saved Dirty bit (leaf) */ #define _PAGE_BIT_NOPTISHADOW _PAGE_BIT_SOFTW2 /* No PTI shadow (root PGD) */ #endif /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) #define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) #define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_SOFTW3 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) #define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K) #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0) #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1) #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2) #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3) #else #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0)) #endif #define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \ _PAGE_PKEY_BIT1 | \ _PAGE_PKEY_BIT2 | \ _PAGE_PKEY_BIT3) #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED) #else #define _PAGE_KNL_ERRATUM_MASK 0 #endif #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif /* * Tracking soft dirty bit when a page goes to a swap is tricky. * We need a bit which can be stored in pte _and_ not conflict * with swap entry format. On x86 bits 1-4 are *not* involved * into swap entry computation, but bit 7 is used for thp migration, * so we borrow bit 1 for soft dirty tracking. * * Please note that this bit must be treated as swap dirty page * mark if and only if the PTE/PMD has present bit clear! */ #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SWP_SOFT_DIRTY _PAGE_RW #else #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define _PAGE_UFFD_WP (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP) #define _PAGE_SWP_UFFD_WP _PAGE_USER #else #define _PAGE_UFFD_WP (_AT(pteval_t, 0)) #define _PAGE_SWP_UFFD_WP (_AT(pteval_t, 0)) #endif #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) #define _PAGE_SOFTW4 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4) #else #define _PAGE_NX (_AT(pteval_t, 0)) #define _PAGE_DEVMAP (_AT(pteval_t, 0)) #define _PAGE_SOFTW4 (_AT(pteval_t, 0)) #endif /* * The hardware requires shadow stack to be Write=0,Dirty=1. However, * there are valid cases where the kernel might create read-only PTEs that * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit, * to avoid creating a wrong "shadow stack" PTEs. Such PTEs have * (Write=0,SavedDirty=1,Dirty=0) set. */ #define _PAGE_SAVED_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY) #define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_NOPTISHADOW (_AT(pteval_t, 1) << _PAGE_BIT_NOPTISHADOW) /* * Set of bits not changed in pte_modify. The pte's * protection key is treated like _PAGE_RW, for * instance, and is *not* included in this mask since * pte_modify() does modify it. */ #define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | \ _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \ _PAGE_DEVMAP | _PAGE_CC | _PAGE_UFFD_WP) #define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT) #define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE) /* * The cache modes defined here are used to translate between pure SW usage * and the HW defined cache mode bits and/or PAT entries. * * The resulting bits for PWT, PCD and PAT should be chosen in a way * to have the WB mode at index 0 (all bits clear). This is the default * right now and likely would break too much if changed. */ #ifndef __ASSEMBLER__ enum page_cache_mode { _PAGE_CACHE_MODE_WB = 0, _PAGE_CACHE_MODE_WC = 1, _PAGE_CACHE_MODE_UC_MINUS = 2, _PAGE_CACHE_MODE_UC = 3, _PAGE_CACHE_MODE_WT = 4, _PAGE_CACHE_MODE_WP = 5, _PAGE_CACHE_MODE_NUM = 8 }; #endif #define _PAGE_CC (_AT(pteval_t, cc_get_mask())) #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) #define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT) #define _PAGE_LARGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) #define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define __PP _PAGE_PRESENT #define __RW _PAGE_RW #define _USR _PAGE_USER #define ___A _PAGE_ACCESSED #define ___D _PAGE_DIRTY #define ___G _PAGE_GLOBAL #define __NX _PAGE_NX #define _ENC _PAGE_ENC #define __WP _PAGE_CACHE_WP #define __NC _PAGE_NOCACHE #define _PSE _PAGE_PSE #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) #define __pg(x) __pgprot(x) #define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G) #define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0) #define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0) #define PAGE_COPY_NOEXEC __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0) #define PAGE_COPY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0) #define PAGE_COPY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0) #define PAGE_READONLY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0) #define PAGE_READONLY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0) #define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) /* * Page tables needs to have Write=1 in order for any lower PTEs to be * writable. This includes shadow stack memory (Write=0, Dirty=1) */ #define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0) #define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC) #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) #define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX| 0| 0|___G) #define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0| 0| 0|___G) #define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) #define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX| 0| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) #define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G) #define __PAGE_KERNEL_WP (__PP|__RW| 0|___A|__NX|___D| 0|___G| __WP) #define __PAGE_KERNEL_IO __PAGE_KERNEL #define __PAGE_KERNEL_IO_NOCACHE __PAGE_KERNEL_NOCACHE #ifndef __ASSEMBLER__ #define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _ENC) #define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _ENC) #define __PAGE_KERNEL_NOENC (__PAGE_KERNEL | 0) #define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP | 0) #define __pgprot_mask(x) __pgprot((x) & __default_kernel_pte_mask) #define PAGE_KERNEL __pgprot_mask(__PAGE_KERNEL | _ENC) #define PAGE_KERNEL_NOENC __pgprot_mask(__PAGE_KERNEL | 0) #define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC) #define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC) #define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0) #define PAGE_KERNEL_ROX __pgprot_mask(__PAGE_KERNEL_ROX | _ENC) #define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC) #define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC) #define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC) #define PAGE_KERNEL_VVAR __pgprot_mask(__PAGE_KERNEL_VVAR | _ENC) #define PAGE_KERNEL_IO __pgprot_mask(__PAGE_KERNEL_IO) #define PAGE_KERNEL_IO_NOCACHE __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE) #endif /* __ASSEMBLER__ */ /* * early identity mapping pte attrib macros. */ #ifdef CONFIG_X86_64 #define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC #else #define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ #define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ #endif #ifdef CONFIG_X86_32 # include <asm/pgtable_32_types.h> #else # include <asm/pgtable_64_types.h> #endif #ifndef __ASSEMBLER__ #include <linux/types.h> /* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) /* * Extracts the flags from a (pte|pmd|pud|pgd)val_t * This includes the protection key value. */ #define PTE_FLAGS_MASK (~PTE_PFN_MASK) typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef struct { pgdval_t pgd; } pgd_t; static inline pgprot_t pgprot_nx(pgprot_t prot) { return __pgprot(pgprot_val(prot) | _PAGE_NX); } #define pgprot_nx pgprot_nx #ifdef CONFIG_X86_PAE /* * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't * use it here. */ #define PGD_PAE_PAGE_MASK ((signed long)PAGE_MASK) #define PGD_PAE_PHYS_MASK (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK) /* * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries. * All other bits are Reserved MBZ */ #define PGD_ALLOWED_BITS (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \ _PAGE_PWT | _PAGE_PCD | \ _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3) #else /* No need to mask any bits for !PAE */ #define PGD_ALLOWED_BITS (~0ULL) #endif static inline pgd_t native_make_pgd(pgdval_t val) { return (pgd_t) { val & PGD_ALLOWED_BITS }; } static inline pgdval_t native_pgd_val(pgd_t pgd) { return pgd.pgd & PGD_ALLOWED_BITS; } static inline pgdval_t pgd_flags(pgd_t pgd) { return native_pgd_val(pgd) & PTE_FLAGS_MASK; } #if CONFIG_PGTABLE_LEVELS > 4 typedef struct { p4dval_t p4d; } p4d_t; static inline p4d_t native_make_p4d(pudval_t val) { return (p4d_t) { val }; } static inline p4dval_t native_p4d_val(p4d_t p4d) { return p4d.p4d; } #else #include <asm-generic/pgtable-nop4d.h> static inline p4d_t native_make_p4d(pudval_t val) { return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) }; } static inline p4dval_t native_p4d_val(p4d_t p4d) { return native_pgd_val(p4d.pgd); } #endif #if CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; static inline pud_t native_make_pud(pmdval_t val) { return (pud_t) { val }; } static inline pudval_t native_pud_val(pud_t pud) { return pud.pud; } #else #include <asm-generic/pgtable-nopud.h> static inline pud_t native_make_pud(pudval_t val) { return (pud_t) { .p4d.pgd = native_make_pgd(val) }; } static inline pudval_t native_pud_val(pud_t pud) { return native_pgd_val(pud.p4d.pgd); } #endif #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t native_make_pmd(pmdval_t val) { return (pmd_t) { .pmd = val }; } static inline pmdval_t native_pmd_val(pmd_t pmd) { return pmd.pmd; } #else #include <asm-generic/pgtable-nopmd.h> static inline pmd_t native_make_pmd(pmdval_t val) { return (pmd_t) { .pud.p4d.pgd = native_make_pgd(val) }; } static inline pmdval_t native_pmd_val(pmd_t pmd) { return native_pgd_val(pmd.pud.p4d.pgd); } #endif static inline p4dval_t p4d_pfn_mask(p4d_t p4d) { /* No 512 GiB huge pages yet */ return PTE_PFN_MASK; } static inline p4dval_t p4d_flags_mask(p4d_t p4d) { return ~p4d_pfn_mask(p4d); } static inline p4dval_t p4d_flags(p4d_t p4d) { return native_p4d_val(p4d) & p4d_flags_mask(p4d); } static inline pudval_t pud_pfn_mask(pud_t pud) { if (native_pud_val(pud) & _PAGE_PSE) return PHYSICAL_PUD_PAGE_MASK; else return PTE_PFN_MASK; } static inline pudval_t pud_flags_mask(pud_t pud) { return ~pud_pfn_mask(pud); } static inline pudval_t pud_flags(pud_t pud) { return native_pud_val(pud) & pud_flags_mask(pud); } static inline pmdval_t pmd_pfn_mask(pmd_t pmd) { if (native_pmd_val(pmd) & _PAGE_PSE) return PHYSICAL_PMD_PAGE_MASK; else return PTE_PFN_MASK; } static inline pmdval_t pmd_flags_mask(pmd_t pmd) { return ~pmd_pfn_mask(pmd); } static inline pmdval_t pmd_flags(pmd_t pmd) { return native_pmd_val(pmd) & pmd_flags_mask(pmd); } static inline pte_t native_make_pte(pteval_t val) { return (pte_t) { .pte = val }; } static inline pteval_t native_pte_val(pte_t pte) { return pte.pte; } static inline pteval_t pte_flags(pte_t pte) { return native_pte_val(pte) & PTE_FLAGS_MASK; } #define __pte2cm_idx(cb) \ ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \ (((cb) >> _PAGE_BIT_PWT) & 1)) #define __cm_idx2pte(i) \ ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \ (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \ (((i) & 1) << _PAGE_BIT_PWT)) unsigned long cachemode2protval(enum page_cache_mode pcm); static inline pgprotval_t protval_4k_2_large(pgprotval_t val) { return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); } static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) { return __pgprot(protval_4k_2_large(pgprot_val(pgprot))); } static inline pgprotval_t protval_large_2_4k(pgprotval_t val) { return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT_LARGE) >> (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); } static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) { return __pgprot(protval_large_2_4k(pgprot_val(pgprot))); } typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern pteval_t __default_kernel_pte_mask; #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); #define pgprot_writethrough pgprot_writethrough extern pgprot_t pgprot_writethrough(pgprot_t prot); /* Indicate that x86 has its own track and untrack pfn vma functions */ #define __HAVE_PFNMAP_TRACKING #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); #ifdef CONFIG_X86_32 extern void native_pagetable_init(void); #else #define native_pagetable_init paging_init #endif enum pg_level { PG_LEVEL_NONE, PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G, PG_LEVEL_512G, PG_LEVEL_256T, PG_LEVEL_NUM }; #ifdef CONFIG_PROC_FS extern void update_page_count(int level, unsigned long pages); #else static inline void update_page_count(int level, unsigned long pages) { } #endif /* * Helper function that returns the kernel pagetable entry controlling * the virtual address 'address'. NULL means no pagetable entry present. * NOTE: the return type is pte_t but if the pmd is PSE then we return it * as a pte too. */ extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level); pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address, unsigned int *level, bool *nx, bool *rw); extern pmd_t *lookup_pmd_address(unsigned long address); extern phys_addr_t slow_virt_to_phys(void *__address); extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags); extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, unsigned long numpages); #endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_PGTABLE_DEFS_H */
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 // SPDX-License-Identifier: GPL-2.0-only #include <linux/init.h> #include <linux/scatterlist.h> #include <linux/mempool.h> #include <linux/slab.h> #define SG_MEMPOOL_NR ARRAY_SIZE(sg_pools) #define SG_MEMPOOL_SIZE 2 struct sg_pool { size_t size; char *name; struct kmem_cache *slab; mempool_t *pool; }; #define SP(x) { .size = x, "sgpool-" __stringify(x) } #if (SG_CHUNK_SIZE < 32) #error SG_CHUNK_SIZE is too small (must be 32 or greater) #endif static struct sg_pool sg_pools[] = { SP(8), SP(16), #if (SG_CHUNK_SIZE > 32) SP(32), #if (SG_CHUNK_SIZE > 64) SP(64), #if (SG_CHUNK_SIZE > 128) SP(128), #if (SG_CHUNK_SIZE > 256) #error SG_CHUNK_SIZE is too large (256 MAX) #endif #endif #endif #endif SP(SG_CHUNK_SIZE) }; #undef SP static inline unsigned int sg_pool_index(unsigned short nents) { unsigned int index; BUG_ON(nents > SG_CHUNK_SIZE); if (nents <= 8) index = 0; else index = get_count_order(nents) - 3; return index; } static void sg_pool_free(struct scatterlist *sgl, unsigned int nents) { struct sg_pool *sgp; sgp = sg_pools + sg_pool_index(nents); mempool_free(sgl, sgp->pool); } static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask) { struct sg_pool *sgp; sgp = sg_pools + sg_pool_index(nents); return mempool_alloc(sgp->pool, gfp_mask); } /** * sg_free_table_chained - Free a previously mapped sg table * @table: The sg table header to use * @nents_first_chunk: size of the first_chunk SGL passed to * sg_alloc_table_chained * * Description: * Free an sg table previously allocated and setup with * sg_alloc_table_chained(). * * @nents_first_chunk has to be same with that same parameter passed * to sg_alloc_table_chained(). * **/ void sg_free_table_chained(struct sg_table *table, unsigned nents_first_chunk) { if (table->orig_nents <= nents_first_chunk) return; if (nents_first_chunk == 1) nents_first_chunk = 0; __sg_free_table(table, SG_CHUNK_SIZE, nents_first_chunk, sg_pool_free, table->orig_nents); } EXPORT_SYMBOL_GPL(sg_free_table_chained); /** * sg_alloc_table_chained - Allocate and chain SGLs in an sg table * @table: The sg table header to use * @nents: Number of entries in sg list * @first_chunk: first SGL * @nents_first_chunk: number of the SGL of @first_chunk * * Description: * Allocate and chain SGLs in an sg table. If @nents@ is larger than * @nents_first_chunk a chained sg table will be setup. @first_chunk is * ignored if nents_first_chunk <= 1 because user expects the SGL points * non-chain SGL. * **/ int sg_alloc_table_chained(struct sg_table *table, int nents, struct scatterlist *first_chunk, unsigned nents_first_chunk) { int ret; BUG_ON(!nents); if (first_chunk && nents_first_chunk) { if (nents <= nents_first_chunk) { table->nents = table->orig_nents = nents; sg_init_table(table->sgl, nents); return 0; } } /* User supposes that the 1st SGL includes real entry */ if (nents_first_chunk <= 1) { first_chunk = NULL; nents_first_chunk = 0; } ret = __sg_alloc_table(table, nents, SG_CHUNK_SIZE, first_chunk, nents_first_chunk, GFP_ATOMIC, sg_pool_alloc); if (unlikely(ret)) sg_free_table_chained(table, nents_first_chunk); return ret; } EXPORT_SYMBOL_GPL(sg_alloc_table_chained); static __init int sg_pool_init(void) { int i; for (i = 0; i < SG_MEMPOOL_NR; i++) { struct sg_pool *sgp = sg_pools + i; int size = sgp->size * sizeof(struct scatterlist); sgp->slab = kmem_cache_create(sgp->name, size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!sgp->slab) { printk(KERN_ERR "SG_POOL: can't init sg slab %s\n", sgp->name); goto cleanup_sdb; } sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, sgp->slab); if (!sgp->pool) { printk(KERN_ERR "SG_POOL: can't init sg mempool %s\n", sgp->name); goto cleanup_sdb; } } return 0; cleanup_sdb: for (i = 0; i < SG_MEMPOOL_NR; i++) { struct sg_pool *sgp = sg_pools + i; mempool_destroy(sgp->pool); kmem_cache_destroy(sgp->slab); } return -ENOMEM; } subsys_initcall(sg_pool_init);
5 2 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 /* * xt_time * Copyright © CC Computer Consultants GmbH, 2007 * * based on ipt_time by Fabrice MARIE <fabrice@netfilter.org> * This is a module which is used for time matching * It is using some modified code from dietlibc (localtime() function) * that you can find at https://www.fefe.de/dietlibc/ * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ktime.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_time.h> struct xtm { u_int8_t month; /* (1-12) */ u_int8_t monthday; /* (1-31) */ u_int8_t weekday; /* (1-7) */ u_int8_t hour; /* (0-23) */ u_int8_t minute; /* (0-59) */ u_int8_t second; /* (0-59) */ unsigned int dse; }; extern struct timezone sys_tz; /* ouch */ static const u_int16_t days_since_year[] = { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, }; static const u_int16_t days_since_leapyear[] = { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, }; /* * Since time progresses forward, it is best to organize this array in reverse, * to minimize lookup time. */ enum { DSE_FIRST = 2039, SECONDS_PER_DAY = 86400, }; static const u_int16_t days_since_epoch[] = { /* 2039 - 2030 */ 25202, 24837, 24472, 24106, 23741, 23376, 23011, 22645, 22280, 21915, /* 2029 - 2020 */ 21550, 21184, 20819, 20454, 20089, 19723, 19358, 18993, 18628, 18262, /* 2019 - 2010 */ 17897, 17532, 17167, 16801, 16436, 16071, 15706, 15340, 14975, 14610, /* 2009 - 2000 */ 14245, 13879, 13514, 13149, 12784, 12418, 12053, 11688, 11323, 10957, /* 1999 - 1990 */ 10592, 10227, 9862, 9496, 9131, 8766, 8401, 8035, 7670, 7305, /* 1989 - 1980 */ 6940, 6574, 6209, 5844, 5479, 5113, 4748, 4383, 4018, 3652, /* 1979 - 1970 */ 3287, 2922, 2557, 2191, 1826, 1461, 1096, 730, 365, 0, }; static inline bool is_leap(unsigned int y) { return y % 4 == 0 && (y % 100 != 0 || y % 400 == 0); } /* * Each network packet has a (nano)seconds-since-the-epoch (SSTE) timestamp. * Since we match against days and daytime, the SSTE value needs to be * computed back into human-readable dates. * * This is done in three separate functions so that the most expensive * calculations are done last, in case a "simple match" can be found earlier. */ static inline unsigned int localtime_1(struct xtm *r, time64_t time) { unsigned int v, w; /* Each day has 86400s, so finding the hour/minute is actually easy. */ div_u64_rem(time, SECONDS_PER_DAY, &v); r->second = v % 60; w = v / 60; r->minute = w % 60; r->hour = w / 60; return v; } static inline void localtime_2(struct xtm *r, time64_t time) { /* * Here comes the rest (weekday, monthday). First, divide the SSTE * by seconds-per-day to get the number of _days_ since the epoch. */ r->dse = div_u64(time, SECONDS_PER_DAY); /* * 1970-01-01 (w=0) was a Thursday (4). * -1 and +1 map Sunday properly onto 7. */ r->weekday = (4 + r->dse - 1) % 7 + 1; } static void localtime_3(struct xtm *r, time64_t time) { unsigned int year, i, w = r->dse; /* * In each year, a certain number of days-since-the-epoch have passed. * Find the year that is closest to said days. * * Consider, for example, w=21612 (2029-03-04). Loop will abort on * dse[i] <= w, which happens when dse[i] == 21550. This implies * year == 2009. w will then be 62. */ for (i = 0, year = DSE_FIRST; days_since_epoch[i] > w; ++i, --year) /* just loop */; w -= days_since_epoch[i]; /* * By now we have the current year, and the day of the year. * r->yearday = w; * * On to finding the month (like above). In each month, a certain * number of days-since-New Year have passed, and find the closest * one. * * Consider w=62 (in a non-leap year). Loop will abort on * dsy[i] < w, which happens when dsy[i] == 31+28 (i == 2). * Concludes i == 2, i.e. 3rd month => March. * * (A different approach to use would be to subtract a monthlength * from w repeatedly while counting.) */ if (is_leap(year)) { /* use days_since_leapyear[] in a leap year */ for (i = ARRAY_SIZE(days_since_leapyear) - 1; i > 0 && days_since_leapyear[i] > w; --i) /* just loop */; r->monthday = w - days_since_leapyear[i] + 1; } else { for (i = ARRAY_SIZE(days_since_year) - 1; i > 0 && days_since_year[i] > w; --i) /* just loop */; r->monthday = w - days_since_year[i] + 1; } r->month = i + 1; } static bool time_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_time_info *info = par->matchinfo; unsigned int packet_time; struct xtm current_time; time64_t stamp; /* * We need real time here, but we can neither use skb->tstamp * nor __net_timestamp(). * * skb->tstamp and skb->skb_mstamp_ns overlap, however, they * use different clock types (real vs monotonic). * * Suppose you have two rules: * 1. match before 13:00 * 2. match after 13:00 * * If you match against processing time (ktime_get_real_seconds) it * may happen that the same packet matches both rules if * it arrived at the right moment before 13:00, so it would be * better to check skb->tstamp and set it via __net_timestamp() * if needed. This however breaks outgoing packets tx timestamp, * and causes them to get delayed forever by fq packet scheduler. */ stamp = ktime_get_real_seconds(); if (info->flags & XT_TIME_LOCAL_TZ) /* Adjust for local timezone */ stamp -= 60 * sys_tz.tz_minuteswest; /* * xt_time will match when _all_ of the following hold: * - 'now' is in the global time range date_start..date_end * - 'now' is in the monthday mask * - 'now' is in the weekday mask * - 'now' is in the daytime range time_start..time_end * (and by default, libxt_time will set these so as to match) * * note: info->date_start/stop are unsigned 32-bit values that * can hold values beyond y2038, but not after y2106. */ if (stamp < info->date_start || stamp > info->date_stop) return false; packet_time = localtime_1(&current_time, stamp); if (info->daytime_start < info->daytime_stop) { if (packet_time < info->daytime_start || packet_time > info->daytime_stop) return false; } else { if (packet_time < info->daytime_start && packet_time > info->daytime_stop) return false; /** if user asked to ignore 'next day', then e.g. * '1 PM Wed, August 1st' should be treated * like 'Tue 1 PM July 31st'. * * This also causes * 'Monday, "23:00 to 01:00", to match for 2 hours, starting * Monday 23:00 to Tuesday 01:00. */ if ((info->flags & XT_TIME_CONTIGUOUS) && packet_time <= info->daytime_stop) stamp -= SECONDS_PER_DAY; } localtime_2(&current_time, stamp); if (!(info->weekdays_match & (1 << current_time.weekday))) return false; /* Do not spend time computing monthday if all days match anyway */ if (info->monthdays_match != XT_TIME_ALL_MONTHDAYS) { localtime_3(&current_time, stamp); if (!(info->monthdays_match & (1 << current_time.monthday))) return false; } return true; } static int time_mt_check(const struct xt_mtchk_param *par) { const struct xt_time_info *info = par->matchinfo; if (info->daytime_start > XT_TIME_MAX_DAYTIME || info->daytime_stop > XT_TIME_MAX_DAYTIME) { pr_info_ratelimited("invalid argument - start or stop time greater than 23:59:59\n"); return -EDOM; } if (info->flags & ~XT_TIME_ALL_FLAGS) { pr_info_ratelimited("unknown flags 0x%x\n", info->flags & ~XT_TIME_ALL_FLAGS); return -EINVAL; } if ((info->flags & XT_TIME_CONTIGUOUS) && info->daytime_start < info->daytime_stop) return -EINVAL; return 0; } static struct xt_match xt_time_mt_reg __read_mostly = { .name = "time", .family = NFPROTO_UNSPEC, .match = time_mt, .checkentry = time_mt_check, .matchsize = sizeof(struct xt_time_info), .me = THIS_MODULE, }; static int __init time_mt_init(void) { int minutes = sys_tz.tz_minuteswest; if (minutes < 0) /* east of Greenwich */ pr_info("kernel timezone is +%02d%02d\n", -minutes / 60, -minutes % 60); else /* west of Greenwich */ pr_info("kernel timezone is -%02d%02d\n", minutes / 60, minutes % 60); return xt_register_match(&xt_time_mt_reg); } static void __exit time_mt_exit(void) { xt_unregister_match(&xt_time_mt_reg); } module_init(time_mt_init); module_exit(time_mt_exit); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: time-based matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_time"); MODULE_ALIAS("ip6t_time");
38 19 22 12 9 5 8 8 8 8 8 2 7 52 14 20 12 8 8 7 8 20 35 27 8 4 8 7 5 2 24 13 11 11 11 39 39 1 40 6 328 329 328 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET3: Garbage Collector For AF_UNIX sockets * * Garbage Collector: * Copyright (C) Barak A. Pearlmutter. * * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem. * If it doesn't work blame me, it worked when Barak sent it. * * Assumptions: * * - object w/ a bit * - free list * * Current optimizations: * * - explicit stack instead of recursion * - tail recurse on first born instead of immediate push/pop * - we gather the stuff that should not be killed into tree * and stack is just a path from root to the current pointer. * * Future optimizations: * * - don't just push entire root set; process in place * * Fixes: * Alan Cox 07 Sept 1997 Vmalloc internal stack as needed. * Cope with changing max_files. * Al Viro 11 Oct 1998 * Graph may have cycles. That is, we can send the descriptor * of foo to bar and vice versa. Current code chokes on that. * Fix: move SCM_RIGHTS ones into the separate list and then * skb_free() them all instead of doing explicit fput's. * Another problem: since fput() may block somebody may * create a new unix_socket when we are in the middle of sweep * phase. Fix: revert the logic wrt MARKED. Mark everything * upon the beginning and unmark non-junk ones. * * [12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS * sent to connect()'ed but still not accept()'ed sockets. * Fixed. Old code had slightly different problem here: * extra fput() in situation when we passed the descriptor via * such socket and closed it (descriptor). That would happen on * each unix_gc() until the accept(). Since the struct file in * question would go to the free list and might be reused... * That might be the reason of random oopses on filp_close() * in unrelated processes. * * AV 28 Feb 1999 * Kill the explicit allocation of stack. Now we keep the tree * with root in dummy + pointer (gc_current) to one of the nodes. * Stack is represented as path from gc_current to dummy. Unmark * now means "add to tree". Push == "make it a son of gc_current". * Pop == "move gc_current to parent". We keep only pointers to * parents (->gc_tree). * AV 1 Mar 1999 * Damn. Added missing check for ->dead in listen queues scanning. * * Miklos Szeredi 25 Jun 2007 * Reimplement with a cycle collecting algorithm. This should * solve several problems with the previous code, like being racy * wrt receive and holding up unrelated socket operations. */ #include <linux/fs.h> #include <linux/list.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/workqueue.h> #include <net/af_unix.h> #include <net/scm.h> #include <net/tcp_states.h> #include "af_unix.h" struct unix_vertex { struct list_head edges; struct list_head entry; struct list_head scc_entry; unsigned long out_degree; unsigned long index; unsigned long scc_index; }; struct unix_edge { struct unix_sock *predecessor; struct unix_sock *successor; struct list_head vertex_entry; struct list_head stack_entry; }; struct unix_sock *unix_get_socket(struct file *filp) { struct inode *inode = file_inode(filp); /* Socket ? */ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { struct socket *sock = SOCKET_I(inode); const struct proto_ops *ops; struct sock *sk = sock->sk; ops = READ_ONCE(sock->ops); /* PF_UNIX ? */ if (sk && ops && ops->family == PF_UNIX) return unix_sk(sk); } return NULL; } static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) { /* If an embryo socket has a fd, * the listener indirectly holds the fd's refcnt. */ if (edge->successor->listener) return unix_sk(edge->successor->listener)->vertex; return edge->successor->vertex; } static bool unix_graph_maybe_cyclic; static bool unix_graph_grouped; static void unix_update_graph(struct unix_vertex *vertex) { /* If the receiver socket is not inflight, no cyclic * reference could be formed. */ if (!vertex) return; unix_graph_maybe_cyclic = true; unix_graph_grouped = false; } static LIST_HEAD(unix_unvisited_vertices); enum unix_vertex_index { UNIX_VERTEX_INDEX_MARK1, UNIX_VERTEX_INDEX_MARK2, UNIX_VERTEX_INDEX_START, }; static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; if (!vertex) { vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); vertex->index = unix_vertex_unvisited_index; vertex->out_degree = 0; INIT_LIST_HEAD(&vertex->edges); INIT_LIST_HEAD(&vertex->scc_entry); list_move_tail(&vertex->entry, &unix_unvisited_vertices); edge->predecessor->vertex = vertex; } vertex->out_degree++; list_add_tail(&edge->vertex_entry, &vertex->edges); unix_update_graph(unix_edge_successor(edge)); } static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; if (!fpl->dead) unix_update_graph(unix_edge_successor(edge)); list_del(&edge->vertex_entry); vertex->out_degree--; if (!vertex->out_degree) { edge->predecessor->vertex = NULL; list_move_tail(&vertex->entry, &fpl->vertices); } } static void unix_free_vertices(struct scm_fp_list *fpl) { struct unix_vertex *vertex, *next_vertex; list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) { list_del(&vertex->entry); kfree(vertex); } } static DEFINE_SPINLOCK(unix_gc_lock); unsigned int unix_tot_inflight; void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) { int i = 0, j = 0; spin_lock(&unix_gc_lock); if (!fpl->count_unix) goto out; do { struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]); struct unix_edge *edge; if (!inflight) continue; edge = fpl->edges + i++; edge->predecessor = inflight; edge->successor = receiver; unix_add_edge(fpl, edge); } while (i < fpl->count_unix); receiver->scm_stat.nr_unix_fds += fpl->count_unix; WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); spin_unlock(&unix_gc_lock); fpl->inflight = true; unix_free_vertices(fpl); } void unix_del_edges(struct scm_fp_list *fpl) { struct unix_sock *receiver; int i = 0; spin_lock(&unix_gc_lock); if (!fpl->count_unix) goto out; do { struct unix_edge *edge = fpl->edges + i++; unix_del_edge(fpl, edge); } while (i < fpl->count_unix); if (!fpl->dead) { receiver = fpl->edges[0].successor; receiver->scm_stat.nr_unix_fds -= fpl->count_unix; } WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); spin_unlock(&unix_gc_lock); fpl->inflight = false; } void unix_update_edges(struct unix_sock *receiver) { /* nr_unix_fds is only updated under unix_state_lock(). * If it's 0 here, the embryo socket is not part of the * inflight graph, and GC will not see it, so no lock needed. */ if (!receiver->scm_stat.nr_unix_fds) { receiver->listener = NULL; } else { spin_lock(&unix_gc_lock); unix_update_graph(unix_sk(receiver->listener)->vertex); receiver->listener = NULL; spin_unlock(&unix_gc_lock); } } int unix_prepare_fpl(struct scm_fp_list *fpl) { struct unix_vertex *vertex; int i; if (!fpl->count_unix) return 0; for (i = 0; i < fpl->count_unix; i++) { vertex = kmalloc(sizeof(*vertex), GFP_KERNEL); if (!vertex) goto err; list_add(&vertex->entry, &fpl->vertices); } fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges), GFP_KERNEL_ACCOUNT); if (!fpl->edges) goto err; return 0; err: unix_free_vertices(fpl); return -ENOMEM; } void unix_destroy_fpl(struct scm_fp_list *fpl) { if (fpl->inflight) unix_del_edges(fpl); kvfree(fpl->edges); unix_free_vertices(fpl); } static bool unix_vertex_dead(struct unix_vertex *vertex) { struct unix_edge *edge; struct unix_sock *u; long total_ref; list_for_each_entry(edge, &vertex->edges, vertex_entry) { struct unix_vertex *next_vertex = unix_edge_successor(edge); /* The vertex's fd can be received by a non-inflight socket. */ if (!next_vertex) return false; /* The vertex's fd can be received by an inflight socket in * another SCC. */ if (next_vertex->scc_index != vertex->scc_index) return false; } /* No receiver exists out of the same SCC. */ edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); u = edge->predecessor; total_ref = file_count(u->sk.sk_socket->file); /* If not close()d, total_ref > out_degree. */ if (total_ref != vertex->out_degree) return false; return true; } static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) { struct unix_vertex *vertex; list_for_each_entry_reverse(vertex, scc, scc_entry) { struct sk_buff_head *queue; struct unix_edge *edge; struct unix_sock *u; edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); u = edge->predecessor; queue = &u->sk.sk_receive_queue; spin_lock(&queue->lock); if (u->sk.sk_state == TCP_LISTEN) { struct sk_buff *skb; skb_queue_walk(queue, skb) { struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; spin_lock(&embryo_queue->lock); skb_queue_splice_init(embryo_queue, hitlist); spin_unlock(&embryo_queue->lock); } } else { skb_queue_splice_init(queue, hitlist); } spin_unlock(&queue->lock); } } static bool unix_scc_cyclic(struct list_head *scc) { struct unix_vertex *vertex; struct unix_edge *edge; /* SCC containing multiple vertices ? */ if (!list_is_singular(scc)) return true; vertex = list_first_entry(scc, typeof(*vertex), scc_entry); /* Self-reference or a embryo-listener circle ? */ list_for_each_entry(edge, &vertex->edges, vertex_entry) { if (unix_edge_successor(edge) == vertex) return true; } return false; } static LIST_HEAD(unix_visited_vertices); static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, struct sk_buff_head *hitlist) { LIST_HEAD(vertex_stack); struct unix_edge *edge; LIST_HEAD(edge_stack); next_vertex: /* Push vertex to vertex_stack and mark it as on-stack * (index >= UNIX_VERTEX_INDEX_START). * The vertex will be popped when finalising SCC later. */ list_add(&vertex->scc_entry, &vertex_stack); vertex->index = *last_index; vertex->scc_index = *last_index; (*last_index)++; /* Explore neighbour vertices (receivers of the current vertex's fd). */ list_for_each_entry(edge, &vertex->edges, vertex_entry) { struct unix_vertex *next_vertex = unix_edge_successor(edge); if (!next_vertex) continue; if (next_vertex->index == unix_vertex_unvisited_index) { /* Iterative deepening depth first search * * 1. Push a forward edge to edge_stack and set * the successor to vertex for the next iteration. */ list_add(&edge->stack_entry, &edge_stack); vertex = next_vertex; goto next_vertex; /* 2. Pop the edge directed to the current vertex * and restore the ancestor for backtracking. */ prev_vertex: edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); list_del_init(&edge->stack_entry); next_vertex = vertex; vertex = edge->predecessor->vertex; /* If the successor has a smaller scc_index, two vertices * are in the same SCC, so propagate the smaller scc_index * to skip SCC finalisation. */ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); } else if (next_vertex->index != unix_vertex_grouped_index) { /* Loop detected by a back/cross edge. * * The successor is on vertex_stack, so two vertices are in * the same SCC. If the successor has a smaller *scc_index*, * propagate it to skip SCC finalisation. */ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); } else { /* The successor was already grouped as another SCC */ } } if (vertex->index == vertex->scc_index) { struct unix_vertex *v; struct list_head scc; bool scc_dead = true; /* SCC finalised. * * If the scc_index was not updated, all the vertices above on * vertex_stack are in the same SCC. Group them using scc_entry. */ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); list_for_each_entry_reverse(v, &scc, scc_entry) { /* Don't restart DFS from this vertex in unix_walk_scc(). */ list_move_tail(&v->entry, &unix_visited_vertices); /* Mark vertex as off-stack. */ v->index = unix_vertex_grouped_index; if (scc_dead) scc_dead = unix_vertex_dead(v); } if (scc_dead) unix_collect_skb(&scc, hitlist); else if (!unix_graph_maybe_cyclic) unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); list_del(&scc); } /* Need backtracking ? */ if (!list_empty(&edge_stack)) goto prev_vertex; } static void unix_walk_scc(struct sk_buff_head *hitlist) { unsigned long last_index = UNIX_VERTEX_INDEX_START; unix_graph_maybe_cyclic = false; /* Visit every vertex exactly once. * __unix_walk_scc() moves visited vertices to unix_visited_vertices. */ while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); __unix_walk_scc(vertex, &last_index, hitlist); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); unix_graph_grouped = true; } static void unix_walk_scc_fast(struct sk_buff_head *hitlist) { unix_graph_maybe_cyclic = false; while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; struct list_head scc; bool scc_dead = true; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); list_add(&scc, &vertex->scc_entry); list_for_each_entry_reverse(vertex, &scc, scc_entry) { list_move_tail(&vertex->entry, &unix_visited_vertices); if (scc_dead) scc_dead = unix_vertex_dead(vertex); } if (scc_dead) unix_collect_skb(&scc, hitlist); else if (!unix_graph_maybe_cyclic) unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); list_del(&scc); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); } static bool gc_in_progress; static void __unix_gc(struct work_struct *work) { struct sk_buff_head hitlist; struct sk_buff *skb; spin_lock(&unix_gc_lock); if (!unix_graph_maybe_cyclic) { spin_unlock(&unix_gc_lock); goto skip_gc; } __skb_queue_head_init(&hitlist); if (unix_graph_grouped) unix_walk_scc_fast(&hitlist); else unix_walk_scc(&hitlist); spin_unlock(&unix_gc_lock); skb_queue_walk(&hitlist, skb) { if (UNIXCB(skb).fp) UNIXCB(skb).fp->dead = true; } __skb_queue_purge_reason(&hitlist, SKB_DROP_REASON_SOCKET_CLOSE); skip_gc: WRITE_ONCE(gc_in_progress, false); } static DECLARE_WORK(unix_gc_work, __unix_gc); void unix_gc(void) { WRITE_ONCE(gc_in_progress, true); queue_work(system_unbound_wq, &unix_gc_work); } #define UNIX_INFLIGHT_TRIGGER_GC 16000 #define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) void wait_for_unix_gc(struct scm_fp_list *fpl) { /* If number of inflight sockets is insane, * force a garbage collect right now. * * Paired with the WRITE_ONCE() in unix_inflight(), * unix_notinflight(), and __unix_gc(). */ if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && !READ_ONCE(gc_in_progress)) unix_gc(); /* Penalise users who want to send AF_UNIX sockets * but whose sockets have not been received yet. */ if (!fpl || !fpl->count_unix || READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) return; if (READ_ONCE(gc_in_progress)) flush_work(&unix_gc_work); }
58 58 58 58 58 58 58 1 1 497 496 243 11 264 60 5 56 60 395 61 7 10 27 2 2 2 2 2 1 1 2 2 21 40 2 26 144 4 6 134 20 78 7 60 57 16 97 10 26 3 6 6 2 4 3 2 1 1 319 1183 496 1501 1180 498 495 496 703 137 756 237 12 12 11 12 9 3 12 12 1 86 86 5 2072 167 2064 2068 548 1868 7 2066 2066 2069 2 12 12 11 2058 2063 2069 544 43 756 573 278 742 663 68 28 73 906 726 321 884 25 83 896 70 182 13 22 166 178 175 45 163 3 16 11 15 6 1 3 2 4 9 10 13 13 5 21 2 6 40 7 2 4 4 4 2 1815 177 1013 1 169 23 64 165 1 9 58 34 50 18 22 10 319 414 27 10 7 68 1509 1430 162 188 2065 693 100 127 740 5 103 111 89 112 32 637 271 648 161 952 954 955 121 543 385 31 30 151 152 150 37 109 8 60 12 12 6 7 5 1 6 55 19 45 453 453 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/export.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_vlan.h> #include <linux/filter.h> #include <net/dsa.h> #include <net/dst_metadata.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/gre.h> #include <net/pptp.h> #include <net/tipc.h> #include <linux/igmp.h> #include <linux/icmp.h> #include <linux/sctp.h> #include <linux/dccp.h> #include <linux/if_tunnel.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <linux/stddef.h> #include <linux/if_ether.h> #include <linux/if_hsr.h> #include <linux/mpls.h> #include <linux/tcp.h> #include <linux/ptp_classify.h> #include <net/flow_dissector.h> #include <net/pkt_cls.h> #include <scsi/fc/fc_fcoe.h> #include <uapi/linux/batadv_packet.h> #include <linux/bpf.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_labels.h> #endif #include <linux/bpf-netns.h> static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { flow_dissector->used_keys |= (1ULL << key_id); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count) { unsigned int i; memset(flow_dissector, 0, sizeof(*flow_dissector)); for (i = 0; i < key_count; i++, key++) { /* User should make sure that every key target offset is within * boundaries of unsigned short. */ BUG_ON(key->offset > USHRT_MAX); BUG_ON(dissector_uses_key(flow_dissector, key->key_id)); dissector_set_key(flow_dissector, key->key_id); flow_dissector->offset[key->key_id] = key->offset; } /* Ensure that the dissector always includes control and basic key. * That way we are able to avoid handling lack of these in fast path. */ BUG_ON(!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL)); BUG_ON(!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_BASIC)); } EXPORT_SYMBOL(skb_flow_dissector_init); #ifdef CONFIG_BPF_SYSCALL int flow_dissector_bpf_prog_attach_check(struct net *net, struct bpf_prog *prog) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; if (net == &init_net) { /* BPF flow dissector in the root namespace overrides * any per-net-namespace one. When attaching to root, * make sure we don't have any BPF program attached * to the non-root namespaces. */ struct net *ns; for_each_net(ns) { if (ns == &init_net) continue; if (rcu_access_pointer(ns->bpf.run_array[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ if (rcu_access_pointer(init_net.bpf.run_array[type])) return -EEXIST; } return 0; } #endif /* CONFIG_BPF_SYSCALL */ /** * skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset * @data: raw buffer pointer to the packet, if NULL use skb->data * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); if (!data) { data = skb->data; hlen = skb_headlen(skb); } if (poff >= 0) { __be32 *ports, _ports; ports = __skb_header_pointer(skb, thoff + poff, sizeof(_ports), data, hlen, &_ports); if (ports) return *ports; } return 0; } EXPORT_SYMBOL(skb_flow_get_ports); static bool icmp_has_id(u8 type) { switch (type) { case ICMP_ECHO: case ICMP_ECHOREPLY: case ICMP_TIMESTAMP: case ICMP_TIMESTAMPREPLY: case ICMPV6_ECHO_REQUEST: case ICMPV6_ECHO_REPLY: return true; } return false; } /** * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields * @skb: sk_buff to extract from * @key_icmp: struct flow_dissector_key_icmp to fill * @data: raw buffer pointer to the packet * @thoff: offset to extract at * @hlen: packet header length */ void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, const void *data, int thoff, int hlen) { struct icmphdr *ih, _ih; ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); if (!ih) return; key_icmp->type = ih->type; key_icmp->code = ih->code; /* As we use 0 to signal that the Id field is not present, * avoid confusion with packets without such field */ if (icmp_has_id(ih->type)) key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1; else key_icmp->id = 0; } EXPORT_SYMBOL(skb_flow_get_icmp_tci); /* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet * using skb_flow_get_icmp_tci(). */ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int thoff, int hlen) { struct flow_dissector_key_icmp *key_icmp; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) return; key_icmp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ICMP, target_container); skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } static void __skb_flow_dissect_ah(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_ipsec *key_ah; struct ip_auth_hdr _hdr, *hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_ah = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC, target_container); key_ah->spi = hdr->spi; } static void __skb_flow_dissect_esp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_ipsec *key_esp; struct ip_esp_hdr _hdr, *hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_esp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC, target_container); key_esp->spi = hdr->spi; } static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_l2tpv3 *key_l2tpv3; struct { __be32 session_id; } *hdr, _hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_l2tpv3 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3, target_container); key_l2tpv3->session_id = hdr->session_id; } void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_meta *meta; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META)) return; meta = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_META, target_container); meta->ingress_ifindex = skb->skb_iif; #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) if (tc_skb_ext_tc_enabled()) { struct tc_skb_ext *ext; ext = skb_ext_find(skb, TC_SKB_EXT); if (ext) meta->l2_miss = ext->l2_miss; } #endif } EXPORT_SYMBOL(skb_flow_dissect_meta); static void skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type, u32 ctrl_flags, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_control *ctrl; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) return; ctrl = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL, target_container); ctrl->addr_type = type; ctrl->flags = ctrl_flags; } void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, bool post_ct, u16 zone) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct flow_dissector_key_ct *key; enum ip_conntrack_info ctinfo; struct nf_conn_labels *cl; struct nf_conn *ct; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT)) return; ct = nf_ct_get(skb, &ctinfo); if (!ct && !post_ct) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CT, target_container); if (!ct) { key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_INVALID; key->ct_zone = zone; return; } if (ctinfo < mapsize) key->ct_state = ctinfo_map[ctinfo]; #if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) key->ct_zone = ct->zone.id; #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) key->ct_mark = READ_ONCE(ct->mark); #endif cl = nf_ct_labels_find(ct); if (cl) memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels)); #endif /* CONFIG_NF_CONNTRACK */ } EXPORT_SYMBOL(skb_flow_dissect_ct); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct ip_tunnel_info *info; struct ip_tunnel_key *key; u32 ctrl_flags = 0; /* A quick check to see if there might be something to do. */ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) return; info = skb_tunnel_info(skb); if (!info) return; key = &info->key; if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM; if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT; if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM; if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT; switch (ip_tunnel_info_af(info)) { case AF_INET: skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ctrl_flags, flow_dissector, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { struct flow_dissector_key_ipv4_addrs *ipv4; ipv4 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, target_container); ipv4->src = key->u.ipv4.src; ipv4->dst = key->u.ipv4.dst; } break; case AF_INET6: skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ctrl_flags, flow_dissector, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *ipv6; ipv6 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, target_container); ipv6->src = key->u.ipv6.src; ipv6->dst = key->u.ipv6.dst; } break; default: skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector, target_container); break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { struct flow_dissector_key_keyid *keyid; keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID, target_container); keyid->keyid = tunnel_id_to_key32(key->tun_id); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { struct flow_dissector_key_ports *tp; tp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS, target_container); tp->src = key->tp_src; tp->dst = key->tp_dst; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) { struct flow_dissector_key_ip *ip; ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP, target_container); ip->tos = key->tos; ip->ttl = key->ttl; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { struct flow_dissector_key_enc_opts *enc_opt; IP_TUNNEL_DECLARE_FLAGS(flags) = { }; u32 val; enc_opt = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, target_container); if (!info->options_len) return; enc_opt->len = info->options_len; ip_tunnel_info_opts_get(enc_opt->data, info); ip_tunnel_set_options_present(flags); ip_tunnel_flags_and(flags, info->key.tun_flags, flags); val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM, IP_TUNNEL_GENEVE_OPT_BIT); enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0; } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); void skb_flow_dissect_hash(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_hash *key; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH)) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_HASH, target_container); key->hash = skb_get_hash_raw(skb); } EXPORT_SYMBOL(skb_flow_dissect_hash); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen, int lse_index, bool *entropy_label) { struct mpls_label *hdr, _hdr; u32 entry, label, bos; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) return FLOW_DISSECT_RET_OUT_GOOD; if (lse_index >= FLOW_DIS_MPLS_MAX) return FLOW_DISSECT_RET_OUT_GOOD; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; entry = ntohl(hdr->entry); label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { struct flow_dissector_key_mpls *key_mpls; struct flow_dissector_mpls_lse *lse; key_mpls = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS, target_container); lse = &key_mpls->ls[lse_index]; lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; lse->mpls_bos = bos; lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; lse->mpls_label = label; dissector_set_mpls_lse(key_mpls, lse_index); } if (*entropy_label && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { struct flow_dissector_key_keyid *key_keyid; key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); key_keyid->keyid = cpu_to_be32(label); } *entropy_label = label == MPLS_LABEL_ENTROPY; return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN; } static enum flow_dissect_ret __skb_flow_dissect_arp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_arp *key_arp; struct { unsigned char ar_sha[ETH_ALEN]; unsigned char ar_sip[4]; unsigned char ar_tha[ETH_ALEN]; unsigned char ar_tip[4]; } *arp_eth, _arp_eth; const struct arphdr *arp; struct arphdr _arp; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP)) return FLOW_DISSECT_RET_OUT_GOOD; arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, hlen, &_arp); if (!arp) return FLOW_DISSECT_RET_OUT_BAD; if (arp->ar_hrd != htons(ARPHRD_ETHER) || arp->ar_pro != htons(ETH_P_IP) || arp->ar_hln != ETH_ALEN || arp->ar_pln != 4 || (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST))) return FLOW_DISSECT_RET_OUT_BAD; arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), sizeof(_arp_eth), data, hlen, &_arp_eth); if (!arp_eth) return FLOW_DISSECT_RET_OUT_BAD; key_arp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ARP, target_container); memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip)); memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip)); /* Only store the lower byte of the opcode; * this covers ARPOP_REPLY and ARPOP_REQUEST. */ key_arp->op = ntohs(arp->ar_op) & 0xff; ether_addr_copy(key_arp->sha, arp_eth->ar_sha); ether_addr_copy(key_arp->tha, arp_eth->ar_tha); return FLOW_DISSECT_RET_OUT_GOOD; } static enum flow_dissect_ret __skb_flow_dissect_cfm(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_cfm *key, *hdr, _hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM)) return FLOW_DISSECT_RET_OUT_GOOD; hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM, target_container); key->mdl_ver = hdr->mdl_ver; key->opcode = hdr->opcode; return FLOW_DISSECT_RET_OUT_GOOD; } static enum flow_dissect_ret __skb_flow_dissect_gre(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 *p_proto, int *p_nhoff, int *p_hlen, unsigned int flags) { struct flow_dissector_key_keyid *key_keyid; struct gre_base_hdr *hdr, _hdr; int offset = 0; u16 gre_ver; hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, *p_hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; /* Only look inside GRE without routing */ if (hdr->flags & GRE_ROUTING) return FLOW_DISSECT_RET_OUT_GOOD; /* Only look inside GRE for version 0 and 1 */ gre_ver = ntohs(hdr->flags & GRE_VERSION); if (gre_ver > 1) return FLOW_DISSECT_RET_OUT_GOOD; *p_proto = hdr->protocol; if (gre_ver) { /* Version1 must be PPTP, and check the flags */ if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) return FLOW_DISSECT_RET_OUT_GOOD; } offset += sizeof(struct gre_base_hdr); if (hdr->flags & GRE_CSUM) offset += sizeof_field(struct gre_full_hdr, csum) + sizeof_field(struct gre_full_hdr, reserved1); if (hdr->flags & GRE_KEY) { const __be32 *keyid; __be32 _keyid; keyid = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_keyid), data, *p_hlen, &_keyid); if (!keyid) return FLOW_DISSECT_RET_OUT_BAD; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID)) { key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); if (gre_ver == 0) key_keyid->keyid = *keyid; else key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; } offset += sizeof_field(struct gre_full_hdr, key); } if (hdr->flags & GRE_SEQ) offset += sizeof_field(struct pptp_gre_header, seq); if (gre_ver == 0) { if (*p_proto == htons(ETH_P_TEB)) { const struct ethhdr *eth; struct ethhdr _eth; eth = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_eth), data, *p_hlen, &_eth); if (!eth) return FLOW_DISSECT_RET_OUT_BAD; *p_proto = eth->h_proto; offset += sizeof(*eth); /* Cap headers that we access via pointers at the * end of the Ethernet header as our maximum alignment * at that point is only 2 bytes. */ if (NET_IP_ALIGN) *p_hlen = *p_nhoff + offset; } } else { /* version 1, must be PPTP */ u8 _ppp_hdr[PPP_HDRLEN]; u8 *ppp_hdr; if (hdr->flags & GRE_ACK) offset += sizeof_field(struct pptp_gre_header, ack); ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_ppp_hdr), data, *p_hlen, _ppp_hdr); if (!ppp_hdr) return FLOW_DISSECT_RET_OUT_BAD; switch (PPP_PROTOCOL(ppp_hdr)) { case PPP_IP: *p_proto = htons(ETH_P_IP); break; case PPP_IPV6: *p_proto = htons(ETH_P_IPV6); break; default: /* Could probably catch some more like MPLS */ break; } offset += PPP_HDRLEN; } *p_nhoff += offset; key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; return FLOW_DISSECT_RET_PROTO_AGAIN; } /** * __skb_flow_dissect_batadv() - dissect batman-adv header * @skb: sk_buff to with the batman-adv header * @key_control: flow dissectors control key * @data: raw buffer pointer to the packet, if NULL use skb->data * @p_proto: pointer used to update the protocol to process next * @p_nhoff: pointer used to update inner network header offset * @hlen: packet header length * @flags: any combination of FLOW_DISSECTOR_F_* * * ETH_P_BATMAN packets are tried to be dissected. Only * &struct batadv_unicast packets are actually processed because they contain an * inner ethernet header and are usually followed by actual network header. This * allows the flow dissector to continue processing the packet. * * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found, * FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation, * otherwise FLOW_DISSECT_RET_OUT_BAD */ static enum flow_dissect_ret __skb_flow_dissect_batadv(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, const void *data, __be16 *p_proto, int *p_nhoff, int hlen, unsigned int flags) { struct { struct batadv_unicast_packet batadv_unicast; struct ethhdr eth; } *hdr, _hdr; hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION) return FLOW_DISSECT_RET_OUT_BAD; if (hdr->batadv_unicast.packet_type != BATADV_UNICAST) return FLOW_DISSECT_RET_OUT_BAD; *p_proto = hdr->eth.h_proto; *p_nhoff += sizeof(*hdr); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; return FLOW_DISSECT_RET_PROTO_AGAIN; } static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int thoff, int hlen) { struct flow_dissector_key_tcp *key_tcp; struct tcphdr *th, _th; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP)) return; th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th); if (!th) return; if (unlikely(__tcp_hdrlen(th) < sizeof(_th))) return; key_tcp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TCP, target_container); key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); } static void __skb_flow_dissect_ports(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; __be32 ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) key_ports_range = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE, target_container); if (!key_ports && !key_ports_range) return; ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); if (key_ports) key_ports->ports = ports; if (key_ports_range) key_ports_range->tp.ports = ports; } static void __skb_flow_dissect_ipv4(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, const struct iphdr *iph) { struct flow_dissector_key_ip *key_ip; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) return; key_ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IP, target_container); key_ip->tos = iph->tos; key_ip->ttl = iph->ttl; } static void __skb_flow_dissect_ipv6(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, const struct ipv6hdr *iph) { struct flow_dissector_key_ip *key_ip; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) return; key_ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IP, target_container); key_ip->tos = ipv6_get_dsfield(iph); key_ip->ttl = iph->hop_limit; } /* Maximum number of protocol headers that can be parsed in * __skb_flow_dissect */ #define MAX_FLOW_DISSECT_HDRS 15 static bool skb_flow_dissect_allowed(int *num_hdrs) { ++*num_hdrs; return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); } static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, target_container); key_control->thoff = flow_keys->thoff; if (flow_keys->is_frag) key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (flow_keys->is_first_frag) key_control->flags |= FLOW_DIS_FIRST_FRAG; if (flow_keys->is_encap) key_control->flags |= FLOW_DIS_ENCAPSULATION; key_basic = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_BASIC, target_container); key_basic->n_proto = flow_keys->n_proto; key_basic->ip_proto = flow_keys->ip_proto; if (flow_keys->addr_proto == ETH_P_IP && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); key_addrs->v4addrs.src = flow_keys->ipv4_src; key_addrs->v4addrs.dst = flow_keys->ipv4_dst; key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } else if (flow_keys->addr_proto == ETH_P_IPV6 && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src, sizeof(key_addrs->v6addrs.src)); memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst, sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) { key_ports_range = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE, target_container); key_ports_range->tp.src = flow_keys->sport; key_ports_range->tp.dst = flow_keys->dport; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_keys->flow_label); } } u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; /* Pass parameters to the BPF program */ memset(flow_keys, 0, sizeof(*flow_keys)); flow_keys->n_proto = proto; flow_keys->nhoff = nhoff; flow_keys->thoff = flow_keys->nhoff; BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG != (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG); BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL != (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP != (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); flow_keys->flags = flags; result = bpf_prog_run_pin_on_cpu(prog, ctx); flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->nhoff, hlen); return result; } static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr) { return hdr->ver == 1 && hdr->type == 1 && hdr->code == 0; } /** * __skb_flow_dissect - extract the flow_keys struct and return it * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified * @flow_dissector: list of keys to dissect * @target_container: target structure to put dissected values into * @data: raw buffer pointer to the packet, if NULL use skb->data * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * @flags: flags that control the dissection process, e.g. * FLOW_DISSECTOR_F_STOP_AT_ENCAP. * * The function will try to retrieve individual keys into target specified * by flow_dissector from either the skbuff or a raw buffer specified by the * rest parameters. * * Caller must take care of zeroing target container memory. */ bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; bool mpls_el = false; int mpls_lse = 0; int num_hdrs = 0; u8 ip_proto = 0; bool ret; if (!data) { data = skb->data; proto = skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; nhoff = skb_network_offset(skb); hlen = skb_headlen(skb); #if IS_ENABLED(CONFIG_NET_DSA) if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) && proto == htons(ETH_P_XDSA))) { struct metadata_dst *md_dst = skb_metadata_dst(skb); const struct dsa_device_ops *ops; int offset = 0; ops = skb->dev->dsa_ptr->tag_ops; /* Only DSA header taggers break flow dissection */ if (ops->needed_headroom && (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)) { if (ops->flow_dissect) ops->flow_dissect(skb, &proto, &offset); else dsa_tag_generic_flow_dissect(skb, &proto, &offset); hlen -= offset; nhoff += offset; } } #endif } /* It is ensured by skb_flow_dissector_init() that control key will * be always present. */ key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, target_container); /* It is ensured by skb_flow_dissector_init() that basic key will * be always present. */ key_basic = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_BASIC, target_container); rcu_read_lock(); if (skb) { if (!net) { if (skb->dev) net = dev_net_rcu(skb->dev); else if (skb->sk) net = sock_net(skb->sk); } } DEBUG_NET_WARN_ON_ONCE(!net); if (net) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog_array *run_array; run_array = rcu_dereference(init_net.bpf.run_array[type]); if (!run_array) run_array = rcu_dereference(net->bpf.run_array[type]); if (run_array) { struct bpf_flow_keys flow_keys; struct bpf_flow_dissector ctx = { .flow_keys = &flow_keys, .data = data, .data_end = data + hlen, }; __be16 n_proto = proto; struct bpf_prog *prog; u32 result; if (skb) { ctx.skb = skb; /* we can't use 'proto' in the skb case * because it might be set to skb->vlan_proto * which has been pulled from the data */ n_proto = skb->protocol; } prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); if (result != BPF_FLOW_DISSECTOR_CONTINUE) { __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); return result == BPF_OK; } } } rcu_read_unlock(); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); struct flow_dissector_key_eth_addrs *key_eth_addrs; key_eth_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS, target_container); memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs)); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) { struct flow_dissector_key_num_of_vlans *key_num_of_vlans; key_num_of_vlans = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, target_container); key_num_of_vlans->num_of_vlans = 0; } proto_again: fdret = FLOW_DISSECT_RET_CONTINUE; switch (proto) { case htons(ETH_P_IP): { const struct iphdr *iph; struct iphdr _iph; iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += iph->ihl * 4; ip_proto = iph->protocol; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); memcpy(&key_addrs->v4addrs.src, &iph->saddr, sizeof(key_addrs->v4addrs.src)); memcpy(&key_addrs->v4addrs.dst, &iph->daddr, sizeof(key_addrs->v4addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } __skb_flow_dissect_ipv4(skb, flow_dissector, target_container, data, iph); if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (iph->frag_off & htons(IP_OFFSET)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } else { key_control->flags |= FLOW_DIS_FIRST_FRAG; if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } } } break; } case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); memcpy(&key_addrs->v6addrs.src, &iph->saddr, sizeof(key_addrs->v6addrs.src)); memcpy(&key_addrs->v6addrs.dst, &iph->daddr, sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } if ((dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL) || (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && ip6_flowlabel(iph)) { __be32 flow_label = ip6_flowlabel(iph); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_label); } if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } } __skb_flow_dissect_ipv6(skb, flow_dissector, target_container, data, iph); break; } case htons(ETH_P_8021AD): case htons(ETH_P_8021Q): { const struct vlan_hdr *vlan = NULL; struct vlan_hdr _vlan; __be16 saved_vlan_tpid = proto; if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX && skb && skb_vlan_tag_present(skb)) { proto = skb->protocol; } else { vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) && !(key_control->flags & FLOW_DIS_ENCAPSULATION)) { struct flow_dissector_key_num_of_vlans *key_nvs; key_nvs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, target_container); key_nvs->num_of_vlans++; } if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) { dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN; } else { fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } if (dissector_uses_key(flow_dissector, dissector_vlan)) { key_vlan = skb_flow_dissector_target(flow_dissector, dissector_vlan, target_container); if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); } else { key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK; key_vlan->vlan_priority = (ntohs(vlan->h_vlan_TCI) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } key_vlan->vlan_tpid = saved_vlan_tpid; key_vlan->vlan_eth_type = proto; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } case htons(ETH_P_PPP_SES): { struct { struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; u16 ppp_proto; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } /* least significant bit of the most significant octet * indicates if protocol field was compressed */ ppp_proto = ntohs(hdr->proto); if (ppp_proto & 0x0100) { ppp_proto = ppp_proto >> 8; nhoff += PPPOE_SES_HLEN - 1; } else { nhoff += PPPOE_SES_HLEN; } if (ppp_proto == PPP_IP) { proto = htons(ETH_P_IP); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_IPV6) { proto = htons(ETH_P_IPV6); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_MPLS_UC) { proto = htons(ETH_P_MPLS_UC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_MPLS_MC) { proto = htons(ETH_P_MPLS_MC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto_is_valid(ppp_proto)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; } else { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PPPOE)) { struct flow_dissector_key_pppoe *key_pppoe; key_pppoe = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PPPOE, target_container); key_pppoe->session_id = hdr->hdr.sid; key_pppoe->ppp_proto = htons(ppp_proto); key_pppoe->type = htons(ETH_P_PPP_SES); } break; } case htons(ETH_P_TIPC): { struct tipc_basic_hdr *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TIPC)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TIPC, target_container); key_addrs->tipckey.key = tipc_hdr_rps_key(hdr); key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC; } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case htons(ETH_P_MPLS_UC): case htons(ETH_P_MPLS_MC): fdret = __skb_flow_dissect_mpls(skb, flow_dissector, target_container, data, nhoff, hlen, mpls_lse, &mpls_el); nhoff += sizeof(struct mpls_label); mpls_lse++; break; case htons(ETH_P_FCOE): if ((hlen - nhoff) < FCOE_HEADER_LEN) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += FCOE_HEADER_LEN; fdret = FLOW_DISSECT_RET_OUT_GOOD; break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): fdret = __skb_flow_dissect_arp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case htons(ETH_P_BATMAN): fdret = __skb_flow_dissect_batadv(skb, key_control, data, &proto, &nhoff, hlen, flags); break; case htons(ETH_P_1588): { struct ptp_header *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += sizeof(struct ptp_header); fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case htons(ETH_P_PRP): case htons(ETH_P_HSR): { struct hsr_tag *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } proto = hdr->encap_proto; nhoff += HSR_HLEN; fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } case htons(ETH_P_CFM): fdret = __skb_flow_dissect_cfm(skb, flow_dissector, target_container, data, nhoff, hlen); break; default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; } /* Process result of proto processing */ switch (fdret) { case FLOW_DISSECT_RET_OUT_GOOD: goto out_good; case FLOW_DISSECT_RET_PROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto proto_again; goto out_good; case FLOW_DISSECT_RET_CONTINUE: case FLOW_DISSECT_RET_IPPROTO_AGAIN: break; case FLOW_DISSECT_RET_OUT_BAD: default: goto out_bad; } ip_proto_again: fdret = FLOW_DISSECT_RET_CONTINUE; switch (ip_proto) { case IPPROTO_GRE: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector, target_container, data, &proto, &nhoff, &hlen, flags); break; case NEXTHDR_HOP: case NEXTHDR_ROUTING: case NEXTHDR_DEST: { u8 _opthdr[2], *opthdr; if (proto != htons(ETH_P_IPV6)) break; opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), data, hlen, &_opthdr); if (!opthdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } ip_proto = opthdr[0]; nhoff += (opthdr[1] + 1) << 3; fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; break; } case NEXTHDR_FRAGMENT: { struct frag_hdr _fh, *fh; if (proto != htons(ETH_P_IPV6)) break; fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), data, hlen, &_fh); if (!fh) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } key_control->flags |= FLOW_DIS_IS_FRAGMENT; nhoff += sizeof(_fh); ip_proto = fh->nexthdr; if (!(fh->frag_off & htons(IP6_OFFSET))) { key_control->flags |= FLOW_DIS_FIRST_FRAG; if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; break; } } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case IPPROTO_IPIP: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } proto = htons(ETH_P_IP); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_IPV6: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } proto = htons(ETH_P_IPV6); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_MPLS: proto = htons(ETH_P_MPLS_UC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_TCP: __skb_flow_dissect_tcp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_ICMP: case IPPROTO_ICMPV6: __skb_flow_dissect_icmp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_L2TP: __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_ESP: __skb_flow_dissect_esp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_AH: __skb_flow_dissect_ah(skb, flow_dissector, target_container, data, nhoff, hlen); break; default: break; } if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT)) __skb_flow_dissect_ports(skb, flow_dissector, target_container, data, nhoff, ip_proto, hlen); /* Process result of IP proto processing */ switch (fdret) { case FLOW_DISSECT_RET_PROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto proto_again; break; case FLOW_DISSECT_RET_IPPROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto ip_proto_again; break; case FLOW_DISSECT_RET_OUT_GOOD: case FLOW_DISSECT_RET_CONTINUE: break; case FLOW_DISSECT_RET_OUT_BAD: default: goto out_bad; } out_good: ret = true; out: key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; return ret; out_bad: ret = false; goto out; } EXPORT_SYMBOL(__skb_flow_dissect); static siphash_aligned_key_t hashrnd; static __always_inline void __flow_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); } static const void *flow_keys_hash_start(const struct flow_keys *flow) { BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT); return &flow->FLOW_KEYS_HASH_START_FIELD; } static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: diff -= sizeof(flow->addrs.v4addrs); break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: diff -= sizeof(flow->addrs.v6addrs); break; case FLOW_DISSECTOR_KEY_TIPC: diff -= sizeof(flow->addrs.tipckey); break; } return sizeof(*flow) - diff; } __be32 flow_get_u32_src(const struct flow_keys *flow) { switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: return flow->addrs.v4addrs.src; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.src); case FLOW_DISSECTOR_KEY_TIPC: return flow->addrs.tipckey.key; default: return 0; } } EXPORT_SYMBOL(flow_get_u32_src); __be32 flow_get_u32_dst(const struct flow_keys *flow) { switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: return flow->addrs.v4addrs.dst; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.dst); default: return 0; } } EXPORT_SYMBOL(flow_get_u32_dst); /* Sort the source and destination IP and the ports, * to have consistent hash within the two directions */ static inline void __flow_hash_consistentify(struct flow_keys *keys) { int addr_diff, i; switch (keys->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: if ((__force u32)keys->addrs.v4addrs.dst < (__force u32)keys->addrs.v4addrs.src) swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); if ((__force u16)keys->ports.dst < (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: addr_diff = memcmp(&keys->addrs.v6addrs.dst, &keys->addrs.v6addrs.src, sizeof(keys->addrs.v6addrs.dst)); if (addr_diff < 0) { for (i = 0; i < 4; i++) swap(keys->addrs.v6addrs.src.s6_addr32[i], keys->addrs.v6addrs.dst.s6_addr32[i]); } if ((__force u16)keys->ports.dst < (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; } } static inline u32 __flow_hash_from_keys(struct flow_keys *keys, const siphash_key_t *keyval) { u32 hash; __flow_hash_consistentify(keys); hash = siphash(flow_keys_hash_start(keys), flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; return hash; } u32 flow_hash_from_keys(struct flow_keys *keys) { __flow_hash_secret_init(); return __flow_hash_from_keys(keys, &hashrnd); } EXPORT_SYMBOL(flow_hash_from_keys); u32 flow_hash_from_keys_seed(struct flow_keys *keys, const siphash_key_t *keyval) { return __flow_hash_from_keys(keys, keyval); } EXPORT_SYMBOL(flow_hash_from_keys_seed); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, const siphash_key_t *keyval) { skb_flow_dissect_flow_keys(skb, keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); return __flow_hash_from_keys(keys, keyval); } struct _flow_keys_digest_data { __be16 n_proto; u8 ip_proto; u8 padding; __be32 ports; __be32 src; __be32 dst; }; void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow) { struct _flow_keys_digest_data *data = (struct _flow_keys_digest_data *)digest; BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); memset(digest, 0, sizeof(*digest)); data->n_proto = flow->basic.n_proto; data->ip_proto = flow->basic.ip_proto; data->ports = flow->ports.ports; data->src = flow->addrs.v4addrs.src; data->dst = flow->addrs.v4addrs.dst; } EXPORT_SYMBOL(make_flow_keys_digest); static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb) { struct flow_keys keys; __flow_hash_secret_init(); memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric, &keys, NULL, 0, 0, 0, 0); return __flow_hash_from_keys(&keys, &hashrnd); } EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net); /** * __skb_get_hash_net: calculate a flow hash * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to calculate flow hash from * * This function calculates a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. */ void __skb_get_hash_net(const struct net *net, struct sk_buff *skb) { struct flow_keys keys; u32 hash; memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(net, skb, &flow_keys_dissector, &keys, NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); __flow_hash_secret_init(); hash = __flow_hash_from_keys(&keys, &hashrnd); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash_net); __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb) { struct flow_keys keys; return ___skb_get_hash(skb, &keys, perturb); } EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen) { u32 poff = keys->control.thoff; /* skip L4 headers for fragments after the first */ if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) && !(keys->control.flags & FLOW_DIS_FIRST_FRAG)) return poff; switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ const u8 *doff; u8 _doff; doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff), data, hlen, &_doff); if (!doff) return poff; poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2); break; } case IPPROTO_UDP: case IPPROTO_UDPLITE: poff += sizeof(struct udphdr); break; /* For the rest, we do not really care about header * extensions at this point for now. */ case IPPROTO_ICMP: poff += sizeof(struct icmphdr); break; case IPPROTO_ICMPV6: poff += sizeof(struct icmp6hdr); break; case IPPROTO_IGMP: poff += sizeof(struct igmphdr); break; case IPPROTO_DCCP: poff += sizeof(struct dccp_hdr); break; case IPPROTO_SCTP: poff += sizeof(struct sctphdr); break; } return poff; } /** * skb_get_poff - get the offset to the payload * @skb: sk_buff to get the payload offset from * * The function will get the offset to the payload as far as it could * be dissected. The main user is currently BPF, so that we can dynamically * truncate packets without needing to push actual payload to the user * space and can analyze headers only, instead. */ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys_basic keys; if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) { memset(keys, 0, sizeof(*keys)); memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, sizeof(keys->addrs.v6addrs.src)); memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, sizeof(keys->addrs.v6addrs.dst)); keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; keys->ports.src = fl6->fl6_sport; keys->ports.dst = fl6->fl6_dport; keys->keyid.keyid = fl6->fl6_gre_key; keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); keys->basic.ip_proto = fl6->flowi6_proto; return flow_hash_from_keys(keys); } EXPORT_SYMBOL(__get_hash_from_flowi6); static const struct flow_dissector_key flow_keys_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_TIPC, .offset = offsetof(struct flow_keys, addrs.tipckey), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, { .key_id = FLOW_DISSECTOR_KEY_VLAN, .offset = offsetof(struct flow_keys, vlan), }, { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, .offset = offsetof(struct flow_keys, tags), }, { .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, .offset = offsetof(struct flow_keys, keyid), }, }; static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, }; static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, }; struct flow_dissector flow_keys_dissector __read_mostly; EXPORT_SYMBOL(flow_keys_dissector); struct flow_dissector flow_keys_basic_dissector __read_mostly; EXPORT_SYMBOL(flow_keys_basic_dissector); static int __init init_default_flow_dissectors(void) { skb_flow_dissector_init(&flow_keys_dissector, flow_keys_dissector_keys, ARRAY_SIZE(flow_keys_dissector_keys)); skb_flow_dissector_init(&flow_keys_dissector_symmetric, flow_keys_dissector_symmetric_keys, ARRAY_SIZE(flow_keys_dissector_symmetric_keys)); skb_flow_dissector_init(&flow_keys_basic_dissector, flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); return 0; } core_initcall(init_default_flow_dissectors);
313 100 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_NAT_H #define _NF_NAT_H #include <linux/list.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/nf_conntrack_pptp.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <uapi/linux/netfilter/nf_nat.h> enum nf_nat_manip_type { NF_NAT_MANIP_SRC, NF_NAT_MANIP_DST }; /* SRC manip occurs POST_ROUTING or LOCAL_IN */ #define HOOK2MANIP(hooknum) ((hooknum) != NF_INET_POST_ROUTING && \ (hooknum) != NF_INET_LOCAL_IN) /* per conntrack: nat application helper private data */ union nf_conntrack_nat_help { /* insert nat helper private data here */ #if IS_ENABLED(CONFIG_NF_NAT_PPTP) struct nf_nat_pptp nat_pptp_info; #endif }; /* The structure embedded in the conntrack structure. */ struct nf_conn_nat { union nf_conntrack_nat_help help; #if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE) int masq_index; #endif }; /* Set up the info structure to map into this range. */ unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype); extern unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum); struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct); static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct) { #if IS_ENABLED(CONFIG_NF_NAT) return nf_ct_ext_find(ct, NF_CT_EXT_NAT); #else return NULL; #endif } static inline bool nf_nat_oif_changed(unsigned int hooknum, enum ip_conntrack_info ctinfo, struct nf_conn_nat *nat, const struct net_device *out) { #if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE) return nat && nat->masq_index && hooknum == NF_INET_POST_ROUTING && CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL && nat->masq_index != out->ifindex; #else return false; #endif } int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, const struct nf_hook_ops *nat_ops, unsigned int ops_count); void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, unsigned int ops_count); unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb); unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir); void nf_nat_csum_recalc(struct sk_buff *skb, u8 nfproto, u8 proto, void *data, __sum16 *check, int datalen, int oldlen); int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum); int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen); int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops); void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops); int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops); void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops); int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops); void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops); unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, int *action, const struct nf_nat_range2 *range, bool commit); static inline int nf_nat_initialized(const struct nf_conn *ct, enum nf_nat_manip_type manip) { if (manip == NF_NAT_MANIP_SRC) return ct->status & IPS_SRC_NAT_DONE; else return ct->status & IPS_DST_NAT_DONE; } #endif
3 557 556 7 1400 1401 4 1525 1508 1509 25 1509 1522 8 8 8 8 8 1523 1521 1524 1402 1402 1400 1511 1525 1522 1403 1545 1505 1558 1561 556 491 538 557 537 493 590 573 538 1399 1402 1399 491 493 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/group.c - Operations for adding/removing multiple files at once. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2013 Greg Kroah-Hartman * Copyright (c) 2013 The Linux Foundation */ #include <linux/kobject.h> #include <linux/module.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/err.h> #include <linux/fs.h> #include "sysfs.h" static void remove_files(struct kernfs_node *parent, const struct attribute_group *grp) { struct attribute *const *attr; struct bin_attribute *const *bin_attr; if (grp->attrs) for (attr = grp->attrs; *attr; attr++) kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) kernfs_remove_by_name(parent, (*bin_attr)->attr.name); } static umode_t __first_visible(const struct attribute_group *grp, struct kobject *kobj) { if (grp->attrs && grp->attrs[0] && grp->is_visible) return grp->is_visible(kobj, grp->attrs[0], 0); if (grp->bin_attrs && grp->bin_attrs[0] && grp->is_bin_visible) return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0); return 0; } static int create_files(struct kernfs_node *parent, struct kobject *kobj, kuid_t uid, kgid_t gid, const struct attribute_group *grp, int update) { struct attribute *const *attr; struct bin_attribute *const *bin_attr; int error = 0, i; if (grp->attrs) { for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { umode_t mode = (*attr)->mode; /* * In update mode, we're changing the permissions or * visibility. Do this by first removing then * re-adding (if required) the file. */ if (update) kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } WARN(mode & ~(SYSFS_PREALLOC | 0664), "Attribute %s: Invalid permissions 0%o\n", (*attr)->name, mode); mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_file_mode_ns(parent, *attr, mode, uid, gid, NULL); if (unlikely(error)) break; } if (error) { remove_files(parent, grp); goto exit; } } if (grp->bin_attrs) { for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { umode_t mode = (*bin_attr)->attr.mode; size_t size = (*bin_attr)->size; if (update) kernfs_remove_by_name(parent, (*bin_attr)->attr.name); if (grp->is_bin_visible) { mode = grp->is_bin_visible(kobj, *bin_attr, i); mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } if (grp->bin_size) size = grp->bin_size(kobj, *bin_attr, i); WARN(mode & ~(SYSFS_PREALLOC | 0664), "Attribute %s: Invalid permissions 0%o\n", (*bin_attr)->attr.name, mode); mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_bin_file_mode_ns(parent, *bin_attr, mode, size, uid, gid, NULL); if (error) break; } if (error) remove_files(parent, grp); } exit: return error; } static int internal_create_group(struct kobject *kobj, int update, const struct attribute_group *grp) { struct kernfs_node *kn; kuid_t uid; kgid_t gid; int error; if (WARN_ON(!kobj || (!update && !kobj->sd))) return -EINVAL; /* Updates may happen before the object has been instantiated */ if (unlikely(update && !kobj->sd)) return -EINVAL; if (!grp->attrs && !grp->bin_attrs) { pr_debug("sysfs: (bin_)attrs not set by subsystem for group: %s/%s, skipping\n", kobj->name, grp->name ?: ""); return 0; } kobject_get_ownership(kobj, &uid, &gid); if (grp->name) { umode_t mode = __first_visible(grp, kobj); if (mode & SYSFS_GROUP_INVISIBLE) mode = 0; else mode = S_IRWXU | S_IRUGO | S_IXUGO; if (update) { kn = kernfs_find_and_get(kobj->sd, grp->name); if (!kn) { pr_debug("attr grp %s/%s not created yet\n", kobj->name, grp->name); /* may have been invisible prior to this update */ update = 0; } else if (!mode) { sysfs_remove_group(kobj, grp); kernfs_put(kn); return 0; } } if (!update) { if (!mode) return 0; kn = kernfs_create_dir_ns(kobj->sd, grp->name, mode, uid, gid, kobj, NULL); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(kobj->sd, grp->name); return PTR_ERR(kn); } } } else { kn = kobj->sd; } kernfs_get(kn); error = create_files(kn, kobj, uid, gid, grp, update); if (error) { if (grp->name) kernfs_remove(kn); } kernfs_put(kn); if (grp->name && update) kernfs_put(kn); return error; } /** * sysfs_create_group - given a directory kobject, create an attribute group * @kobj: The kobject to create the group on * @grp: The attribute group to create * * This function creates a group for the first time. It will explicitly * warn and error if any of the attribute files being created already exist. * * Returns 0 on success or error code on failure. */ int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return internal_create_group(kobj, 0, grp); } EXPORT_SYMBOL_GPL(sysfs_create_group); static int internal_create_groups(struct kobject *kobj, int update, const struct attribute_group **groups) { int error = 0; int i; if (!groups) return 0; for (i = 0; groups[i]; i++) { error = internal_create_group(kobj, update, groups[i]); if (error) { while (--i >= 0) sysfs_remove_group(kobj, groups[i]); break; } } return error; } /** * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups * @kobj: The kobject to create the group on * @groups: The attribute groups to create, NULL terminated * * This function creates a bunch of attribute groups. If an error occurs when * creating a group, all previously created groups will be removed, unwinding * everything back to the original state when this function was called. * It will explicitly warn and error if any of the attribute files being * created already exist. * * Returns 0 on success or error code from sysfs_create_group on failure. */ int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return internal_create_groups(kobj, 0, groups); } EXPORT_SYMBOL_GPL(sysfs_create_groups); /** * sysfs_update_groups - given a directory kobject, create a bunch of attribute groups * @kobj: The kobject to update the group on * @groups: The attribute groups to update, NULL terminated * * This function update a bunch of attribute groups. If an error occurs when * updating a group, all previously updated groups will be removed together * with already existing (not updated) attributes. * * Returns 0 on success or error code from sysfs_update_group on failure. */ int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return internal_create_groups(kobj, 1, groups); } EXPORT_SYMBOL_GPL(sysfs_update_groups); /** * sysfs_update_group - given a directory kobject, update an attribute group * @kobj: The kobject to update the group on * @grp: The attribute group to update * * This function updates an attribute group. Unlike * sysfs_create_group(), it will explicitly not warn or error if any * of the attribute files being created already exist. Furthermore, * if the visibility of the files has changed through the is_visible() * callback, it will update the permissions and add or remove the * relevant files. Changing a group's name (subdirectory name under * kobj's directory in sysfs) is not allowed. * * The primary use for this function is to call it after making a change * that affects group visibility. * * Returns 0 on success or error code on failure. */ int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return internal_create_group(kobj, 1, grp); } EXPORT_SYMBOL_GPL(sysfs_update_group); /** * sysfs_remove_group: remove a group from a kobject * @kobj: kobject to remove the group from * @grp: group to remove * * This function removes a group of attributes from a kobject. The attributes * previously have to have been created for this group, otherwise it will fail. */ void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent = kobj->sd; struct kernfs_node *kn; if (grp->name) { kn = kernfs_find_and_get(parent, grp->name); if (!kn) { pr_debug("sysfs group '%s' not found for kobject '%s'\n", grp->name, kobject_name(kobj)); return; } } else { kn = parent; kernfs_get(kn); } remove_files(kn, grp); if (grp->name) kernfs_remove(kn); kernfs_put(kn); } EXPORT_SYMBOL_GPL(sysfs_remove_group); /** * sysfs_remove_groups - remove a list of groups * * @kobj: The kobject for the groups to be removed from * @groups: NULL terminated list of groups to be removed * * If groups is not NULL, remove the specified groups from the kobject. */ void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { int i; if (!groups) return; for (i = 0; groups[i]; i++) sysfs_remove_group(kobj, groups[i]); } EXPORT_SYMBOL_GPL(sysfs_remove_groups); /** * sysfs_merge_group - merge files into a pre-existing named attribute group. * @kobj: The kobject containing the group. * @grp: The files to create and the attribute group they belong to. * * This function returns an error if the group doesn't exist, the .name field is * NULL or any of the files already exist in that group, in which case none of * the new files are created. */ int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent; kuid_t uid; kgid_t gid; int error = 0; struct attribute *const *attr; int i; parent = kernfs_find_and_get(kobj->sd, grp->name); if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) error = sysfs_add_file_mode_ns(parent, *attr, (*attr)->mode, uid, gid, NULL); if (error) { while (--i >= 0) kernfs_remove_by_name(parent, (*--attr)->name); } kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_merge_group); /** * sysfs_unmerge_group - remove files from a pre-existing named attribute group. * @kobj: The kobject containing the group. * @grp: The files to remove and the attribute group they belong to. */ void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent; struct attribute *const *attr; parent = kernfs_find_and_get(kobj->sd, grp->name); if (parent) { for (attr = grp->attrs; *attr; ++attr) kernfs_remove_by_name(parent, (*attr)->name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); /** * sysfs_add_link_to_group - add a symlink to an attribute group. * @kobj: The kobject containing the group. * @group_name: The name of the group. * @target: The target kobject of the symlink to create. * @link_name: The name of the symlink to create. */ int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { struct kernfs_node *parent; int error = 0; parent = kernfs_find_and_get(kobj->sd, group_name); if (!parent) return -ENOENT; error = sysfs_create_link_sd(parent, target, link_name); kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); /** * sysfs_remove_link_from_group - remove a symlink from an attribute group. * @kobj: The kobject containing the group. * @group_name: The name of the group. * @link_name: The name of the symlink to remove. */ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { struct kernfs_node *parent; parent = kernfs_find_and_get(kobj->sd, group_name); if (parent) { kernfs_remove_by_name(parent, link_name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); /** * compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing * to a group or an attribute * @kobj: The kobject containing the group. * @target_kobj: The target kobject. * @target_name: The name of the target group or attribute. * @symlink_name: The name of the symlink file (target_name will be * considered if symlink_name is NULL). */ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { struct kernfs_node *target; struct kernfs_node *entry; struct kernfs_node *link; /* * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir() * for details. */ spin_lock(&sysfs_symlink_target_lock); target = target_kobj->sd; if (target) kernfs_get(target); spin_unlock(&sysfs_symlink_target_lock); if (!target) return -ENOENT; entry = kernfs_find_and_get(target, target_name); if (!entry) { kernfs_put(target); return -ENOENT; } if (!symlink_name) symlink_name = target_name; link = kernfs_create_link(kobj->sd, symlink_name, entry); if (PTR_ERR(link) == -EEXIST) sysfs_warn_dup(kobj->sd, symlink_name); kernfs_put(entry); kernfs_put(target); return PTR_ERR_OR_ZERO(link); } EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj); static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn, const struct attribute_group *grp, struct iattr *newattrs) { struct kernfs_node *kn; int error; if (grp->attrs) { struct attribute *const *attr; for (attr = grp->attrs; *attr; attr++) { kn = kernfs_find_and_get(grp_kn, (*attr)->name); if (!kn) return -ENOENT; error = kernfs_setattr(kn, newattrs); kernfs_put(kn); if (error) return error; } } if (grp->bin_attrs) { struct bin_attribute *const *bin_attr; for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name); if (!kn) return -ENOENT; error = kernfs_setattr(kn, newattrs); kernfs_put(kn); if (error) return error; } } return 0; } /** * sysfs_group_change_owner - change owner of an attribute group. * @kobj: The kobject containing the group. * @grp: The attribute group. * @kuid: new owner's kuid * @kgid: new owner's kgid * * Returns 0 on success or error code on failure. */ int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *grp, kuid_t kuid, kgid_t kgid) { struct kernfs_node *grp_kn; int error; struct iattr newattrs = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = kuid, .ia_gid = kgid, }; if (!kobj->state_in_sysfs) return -EINVAL; if (grp->name) { grp_kn = kernfs_find_and_get(kobj->sd, grp->name); } else { kernfs_get(kobj->sd); grp_kn = kobj->sd; } if (!grp_kn) return -ENOENT; error = kernfs_setattr(grp_kn, &newattrs); if (!error) error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs); kernfs_put(grp_kn); return error; } EXPORT_SYMBOL_GPL(sysfs_group_change_owner); /** * sysfs_groups_change_owner - change owner of a set of attribute groups. * @kobj: The kobject containing the groups. * @groups: The attribute groups. * @kuid: new owner's kuid * @kgid: new owner's kgid * * Returns 0 on success or error code on failure. */ int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { int error = 0, i; if (!kobj->state_in_sysfs) return -EINVAL; if (!groups) return 0; for (i = 0; groups[i]; i++) { error = sysfs_group_change_owner(kobj, groups[i], kuid, kgid); if (error) break; } return error; } EXPORT_SYMBOL_GPL(sysfs_groups_change_owner);
5595 5487 206 5595 5538 761 760 758 760 760 755 6 751 752 5 12 12 438 438 257 436 436 10 4993 4997 1191 5317 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H #include <linux/lockdep.h> #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/rwsem.h> #include <linux/tracepoint-defs.h> #include <linux/types.h> #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), DECLARE_TRACEPOINT(mmap_lock_start_locking); DECLARE_TRACEPOINT(mmap_lock_acquire_returned); DECLARE_TRACEPOINT(mmap_lock_released); #ifdef CONFIG_TRACING void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write); void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, bool success); void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write); static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, bool write) { if (tracepoint_enabled(mmap_lock_start_locking)) __mmap_lock_do_trace_start_locking(mm, write); } static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { if (tracepoint_enabled(mmap_lock_acquire_returned)) __mmap_lock_do_trace_acquire_returned(mm, write, success); } static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) { if (tracepoint_enabled(mmap_lock_released)) __mmap_lock_do_trace_released(mm, write); } #else /* !CONFIG_TRACING */ static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, bool write) { } static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { } static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) { } #endif /* CONFIG_TRACING */ static inline void mmap_assert_locked(const struct mm_struct *mm) { rwsem_assert_held(&mm->mmap_lock); } static inline void mmap_assert_write_locked(const struct mm_struct *mm) { rwsem_assert_held_write(&mm->mmap_lock); } #ifdef CONFIG_PER_VMA_LOCK static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); } static inline void mm_lock_seqcount_begin(struct mm_struct *mm) { do_raw_write_seqcount_begin(&mm->mm_lock_seq); } static inline void mm_lock_seqcount_end(struct mm_struct *mm) { ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); do_raw_write_seqcount_end(&mm->mm_lock_seq); } static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) { /* * Since mmap_lock is a sleeping lock, and waiting for it to become * unlocked is more or less equivalent with taking it ourselves, don't * bother with the speculative path if mmap_lock is already write-locked * and take the slow path, which takes the lock. */ return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); } static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) { return read_seqcount_retry(&mm->mm_lock_seq, seq); } #else /* CONFIG_PER_VMA_LOCK */ static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) { return false; } static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) { return true; } #endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } static inline int mmap_write_lock_killable(struct mm_struct *mm) { int ret; __mmap_lock_trace_start_locking(mm, true); ret = down_write_killable(&mm->mmap_lock); if (!ret) mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, ret == 0); return ret; } /* * Drop all currently-held per-VMA locks. * This is called from the mmap_lock implementation directly before releasing * a write-locked mmap_lock (or downgrading it to read-locked). * This should normally NOT be called manually from other places. * If you want to call this manually anyway, keep in mind that this will release * *all* VMA write locks, including ones from further up the stack. */ static inline void vma_end_write_all(struct mm_struct *mm) { mmap_assert_write_locked(mm); mm_lock_seqcount_end(mm); } static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); vma_end_write_all(mm); up_write(&mm->mmap_lock); } static inline void mmap_write_downgrade(struct mm_struct *mm) { __mmap_lock_trace_acquire_returned(mm, false, true); vma_end_write_all(mm); downgrade_write(&mm->mmap_lock); } static inline void mmap_read_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, false); down_read(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, false, true); } static inline int mmap_read_lock_killable(struct mm_struct *mm) { int ret; __mmap_lock_trace_start_locking(mm, false); ret = down_read_killable(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, false, ret == 0); return ret; } static inline bool mmap_read_trylock(struct mm_struct *mm) { bool ret; __mmap_lock_trace_start_locking(mm, false); ret = down_read_trylock(&mm->mmap_lock) != 0; __mmap_lock_trace_acquire_returned(mm, false, ret); return ret; } static inline void mmap_read_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); up_read(&mm->mmap_lock); } static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); up_read_non_owner(&mm->mmap_lock); } static inline int mmap_lock_is_contended(struct mm_struct *mm) { return rwsem_is_contended(&mm->mmap_lock); } #endif /* _LINUX_MMAP_LOCK_H */
136 6 136 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_HUGETLB_H #define _ASM_GENERIC_HUGETLB_H #include <linux/swap.h> #include <linux/swapops.h> static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) { return mk_pte(page, pgprot); } static inline unsigned long huge_pte_write(pte_t pte) { return pte_write(pte); } static inline unsigned long huge_pte_dirty(pte_t pte) { return pte_dirty(pte); } static inline pte_t huge_pte_mkwrite(pte_t pte) { return pte_mkwrite_novma(pte); } #ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); } #endif static inline pte_t huge_pte_mkdirty(pte_t pte) { return pte_mkdirty(pte); } static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) { return pte_modify(pte, newprot); } #ifndef __HAVE_ARCH_HUGE_PTE_MKUFFD_WP static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { return huge_pte_wrprotect(pte_mkuffd_wp(pte)); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) { return pte_clear_uffd_wp(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_UFFD_WP static inline int huge_pte_uffd_wp(pte_t pte) { return pte_uffd_wp(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { pte_clear(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { free_pgd_range(tlb, addr, end, floor, ceiling); } #endif #ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz) { set_pte_at(mm, addr, ptep, pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { return ptep_get_and_clear(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return ptep_clear_flush(vma, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_NONE static inline int huge_pte_none(pte_t pte) { return pte_none(pte); } #endif /* Please refer to comments above pte_none_mostly() for the usage */ #ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY static inline int huge_pte_none_mostly(pte_t pte) { return huge_pte_none(pte) || is_pte_marker(pte); } #endif #ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { return 0; } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep_set_wrprotect(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { return ptep_set_access_flags(vma, addr, ptep, pte, dirty); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED static inline bool gigantic_page_runtime_supported(void) { return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE); } #endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */ #endif /* _ASM_GENERIC_HUGETLB_H */
4353 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/dsa.h - Driver for Distributed Switch Architecture switch chips * Copyright (c) 2008-2009 Marvell Semiconductor */ #ifndef __LINUX_NET_DSA_H #define __LINUX_NET_DSA_H #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/of.h> #include <linux/ethtool.h> #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/platform_data/dsa.h> #include <linux/phylink.h> #include <net/devlink.h> #include <net/switchdev.h> struct dsa_8021q_context; struct tc_action; #define DSA_TAG_PROTO_NONE_VALUE 0 #define DSA_TAG_PROTO_BRCM_VALUE 1 #define DSA_TAG_PROTO_BRCM_PREPEND_VALUE 2 #define DSA_TAG_PROTO_DSA_VALUE 3 #define DSA_TAG_PROTO_EDSA_VALUE 4 #define DSA_TAG_PROTO_GSWIP_VALUE 5 #define DSA_TAG_PROTO_KSZ9477_VALUE 6 #define DSA_TAG_PROTO_KSZ9893_VALUE 7 #define DSA_TAG_PROTO_LAN9303_VALUE 8 #define DSA_TAG_PROTO_MTK_VALUE 9 #define DSA_TAG_PROTO_QCA_VALUE 10 #define DSA_TAG_PROTO_TRAILER_VALUE 11 #define DSA_TAG_PROTO_8021Q_VALUE 12 #define DSA_TAG_PROTO_SJA1105_VALUE 13 #define DSA_TAG_PROTO_KSZ8795_VALUE 14 #define DSA_TAG_PROTO_OCELOT_VALUE 15 #define DSA_TAG_PROTO_AR9331_VALUE 16 #define DSA_TAG_PROTO_RTL4_A_VALUE 17 #define DSA_TAG_PROTO_HELLCREEK_VALUE 18 #define DSA_TAG_PROTO_XRS700X_VALUE 19 #define DSA_TAG_PROTO_OCELOT_8021Q_VALUE 20 #define DSA_TAG_PROTO_SEVILLE_VALUE 21 #define DSA_TAG_PROTO_BRCM_LEGACY_VALUE 22 #define DSA_TAG_PROTO_SJA1110_VALUE 23 #define DSA_TAG_PROTO_RTL8_4_VALUE 24 #define DSA_TAG_PROTO_RTL8_4T_VALUE 25 #define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26 #define DSA_TAG_PROTO_LAN937X_VALUE 27 #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE, DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, DSA_TAG_PROTO_GSWIP = DSA_TAG_PROTO_GSWIP_VALUE, DSA_TAG_PROTO_KSZ9477 = DSA_TAG_PROTO_KSZ9477_VALUE, DSA_TAG_PROTO_KSZ9893 = DSA_TAG_PROTO_KSZ9893_VALUE, DSA_TAG_PROTO_LAN9303 = DSA_TAG_PROTO_LAN9303_VALUE, DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE, DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, DSA_TAG_PROTO_KSZ8795 = DSA_TAG_PROTO_KSZ8795_VALUE, DSA_TAG_PROTO_OCELOT = DSA_TAG_PROTO_OCELOT_VALUE, DSA_TAG_PROTO_AR9331 = DSA_TAG_PROTO_AR9331_VALUE, DSA_TAG_PROTO_RTL4_A = DSA_TAG_PROTO_RTL4_A_VALUE, DSA_TAG_PROTO_HELLCREEK = DSA_TAG_PROTO_HELLCREEK_VALUE, DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE, DSA_TAG_PROTO_OCELOT_8021Q = DSA_TAG_PROTO_OCELOT_8021Q_VALUE, DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE, DSA_TAG_PROTO_SJA1110 = DSA_TAG_PROTO_SJA1110_VALUE, DSA_TAG_PROTO_RTL8_4 = DSA_TAG_PROTO_RTL8_4_VALUE, DSA_TAG_PROTO_RTL8_4T = DSA_TAG_PROTO_RTL8_4T_VALUE, DSA_TAG_PROTO_RZN1_A5PSW = DSA_TAG_PROTO_RZN1_A5PSW_VALUE, DSA_TAG_PROTO_LAN937X = DSA_TAG_PROTO_LAN937X_VALUE, DSA_TAG_PROTO_VSC73XX_8021Q = DSA_TAG_PROTO_VSC73XX_8021Q_VALUE, }; struct dsa_switch; struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); int (*connect)(struct dsa_switch *ds); void (*disconnect)(struct dsa_switch *ds); unsigned int needed_headroom; unsigned int needed_tailroom; const char *name; enum dsa_tag_protocol proto; /* Some tagging protocols either mangle or shift the destination MAC * address, in which case the DSA conduit would drop packets on ingress * if what it understands out of the destination MAC address is not in * its RX filter. */ bool promisc_on_conduit; }; struct dsa_lag { struct net_device *dev; unsigned int id; struct mutex fdb_lock; struct list_head fdbs; refcount_t refcount; }; struct dsa_switch_tree { struct list_head list; /* List of switch ports */ struct list_head ports; /* Notifier chain for switch-wide events */ struct raw_notifier_head nh; /* Tree identifier */ unsigned int index; /* Number of switches attached to this tree */ struct kref refcount; /* Maps offloaded LAG netdevs to a zero-based linear ID for * drivers that need it. */ struct dsa_lag **lags; /* Tagging protocol operations */ const struct dsa_device_ops *tag_ops; /* Default tagging protocol preferred by the switches in this * tree. */ enum dsa_tag_protocol default_proto; /* Has this tree been applied to the hardware? */ bool setup; /* * Configuration data for the platform device that owns * this dsa switch tree instance. */ struct dsa_platform_data *pd; /* List of DSA links composing the routing table */ struct list_head rtable; /* Length of "lags" array */ unsigned int lags_len; /* Track the largest switch index within a tree */ unsigned int last_switch; }; /* LAG IDs are one-based, the dst->lags array is zero-based */ #define dsa_lags_foreach_id(_id, _dst) \ for ((_id) = 1; (_id) <= (_dst)->lags_len; (_id)++) \ if ((_dst)->lags[(_id) - 1]) #define dsa_lag_foreach_port(_dp, _dst, _lag) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_offloads_lag((_dp), (_lag))) #define dsa_hsr_foreach_port(_dp, _ds, _hsr) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr)) static inline struct dsa_lag *dsa_lag_by_id(struct dsa_switch_tree *dst, unsigned int id) { /* DSA LAG IDs are one-based, dst->lags is zero-based */ return dst->lags[id - 1]; } static inline int dsa_lag_id(struct dsa_switch_tree *dst, struct net_device *lag_dev) { unsigned int id; dsa_lags_foreach_id(id, dst) { struct dsa_lag *lag = dsa_lag_by_id(dst, id); if (lag->dev == lag_dev) return lag->id; } return -ENODEV; } /* TC matchall action types */ enum dsa_port_mall_action_type { DSA_PORT_MALL_MIRROR, DSA_PORT_MALL_POLICER, }; /* TC mirroring entry */ struct dsa_mall_mirror_tc_entry { u8 to_local_port; bool ingress; }; /* TC port policer entry */ struct dsa_mall_policer_tc_entry { u32 burst; u64 rate_bytes_per_sec; }; /* TC matchall entry */ struct dsa_mall_tc_entry { struct list_head list; unsigned long cookie; enum dsa_port_mall_action_type type; union { struct dsa_mall_mirror_tc_entry mirror; struct dsa_mall_policer_tc_entry policer; }; }; struct dsa_bridge { struct net_device *dev; unsigned int num; bool tx_fwd_offload; refcount_t refcount; }; struct dsa_port { /* A CPU port is physically connected to a conduit device. A user port * exposes a network device to user-space, called 'user' here. */ union { struct net_device *conduit; struct net_device *user; }; /* Copy of the tagging protocol operations, for quicker access * in the data path. Valid only for the CPU ports. */ const struct dsa_device_ops *tag_ops; /* Copies for faster access in conduit receive hot path */ struct dsa_switch_tree *dst; struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); struct dsa_switch *ds; unsigned int index; enum { DSA_PORT_TYPE_UNUSED = 0, DSA_PORT_TYPE_CPU, DSA_PORT_TYPE_DSA, DSA_PORT_TYPE_USER, } type; const char *name; struct dsa_port *cpu_dp; u8 mac[ETH_ALEN]; u8 stp_state; /* Warning: the following bit fields are not atomic, and updating them * can only be done from code paths where concurrency is not possible * (probe time or under rtnl_lock). */ u8 vlan_filtering:1; /* Managed by DSA on user ports and by drivers on CPU and DSA ports */ u8 learning:1; u8 lag_tx_enabled:1; /* conduit state bits, valid only on CPU ports */ u8 conduit_admin_up:1; u8 conduit_oper_up:1; /* Valid only on user ports */ u8 cpu_port_in_lag:1; u8 setup:1; struct device_node *dn; unsigned int ageing_time; struct dsa_bridge *bridge; struct devlink_port devlink_port; struct phylink *pl; struct phylink_config pl_config; struct dsa_lag *lag; struct net_device *hsr_dev; struct list_head list; /* * Original copy of the conduit netdev ethtool_ops */ const struct ethtool_ops *orig_ethtool_ops; /* List of MAC addresses that must be forwarded on this port. * These are only valid on CPU ports and DSA links. */ struct mutex addr_lists_lock; struct list_head fdbs; struct list_head mdbs; struct mutex vlans_lock; union { /* List of VLANs that CPU and DSA ports are members of. * Access to this is serialized by the sleepable @vlans_lock. */ struct list_head vlans; /* List of VLANs that user ports are members of. * Access to this is serialized by netif_addr_lock_bh(). */ struct list_head user_vlans; }; }; static inline struct dsa_port * dsa_phylink_to_port(struct phylink_config *config) { return container_of(config, struct dsa_port, pl_config); } /* TODO: ideally DSA ports would have a single dp->link_dp member, * and no dst->rtable nor this struct dsa_link would be needed, * but this would require some more complex tree walking, * so keep it stupid at the moment and list them all. */ struct dsa_link { struct dsa_port *dp; struct dsa_port *link_dp; struct list_head list; }; enum dsa_db_type { DSA_DB_PORT, DSA_DB_LAG, DSA_DB_BRIDGE, }; struct dsa_db { enum dsa_db_type type; union { const struct dsa_port *dp; struct dsa_lag lag; struct dsa_bridge bridge; }; }; struct dsa_mac_addr { unsigned char addr[ETH_ALEN]; u16 vid; refcount_t refcount; struct list_head list; struct dsa_db db; }; struct dsa_vlan { u16 vid; refcount_t refcount; struct list_head list; }; struct dsa_switch { struct device *dev; /* * Parent switch tree, and switch index. */ struct dsa_switch_tree *dst; unsigned int index; /* Warning: the following bit fields are not atomic, and updating them * can only be done from code paths where concurrency is not possible * (probe time or under rtnl_lock). */ u32 setup:1; /* Disallow bridge core from requesting different VLAN awareness * settings on ports if not hardware-supported */ u32 vlan_filtering_is_global:1; /* Keep VLAN filtering enabled on ports not offloading any upper */ u32 needs_standalone_vlan_filtering:1; /* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges * that have vlan_filtering=0. All drivers should ideally set this (and * then the option would get removed), but it is unknown whether this * would break things or not. */ u32 configure_vlan_while_not_filtering:1; /* Pop the default_pvid of VLAN-unaware bridge ports from tagged frames. * DEPRECATED: Do NOT set this field in new drivers. Instead look at * the dsa_software_vlan_untag() comments. */ u32 untag_bridge_pvid:1; /* Pop the default_pvid of VLAN-aware bridge ports from tagged frames. * Useful if the switch cannot preserve the VLAN tag as seen on the * wire for user port ingress, and chooses to send all frames as * VLAN-tagged to the CPU, including those which were originally * untagged. */ u32 untag_vlan_aware_bridge_pvid:1; /* Let DSA manage the FDB entries towards the * CPU, based on the software bridge database. */ u32 assisted_learning_on_cpu_port:1; /* In case vlan_filtering_is_global is set, the VLAN awareness state * should be retrieved from here and not from the per-port settings. */ u32 vlan_filtering:1; /* For switches that only have the MRU configurable. To ensure the * configured MTU is not exceeded, normalization of MRU on all bridged * interfaces is needed. */ u32 mtu_enforcement_ingress:1; /* Drivers that isolate the FDBs of multiple bridges must set this * to true to receive the bridge as an argument in .port_fdb_{add,del} * and .port_mdb_{add,del}. Otherwise, the bridge.num will always be * passed as zero. */ u32 fdb_isolation:1; /* Drivers that have global DSCP mapping settings must set this to * true to automatically apply the settings to all ports. */ u32 dscp_prio_mapping_is_global:1; /* Listener for switch fabric events */ struct notifier_block nb; /* * Give the switch driver somewhere to hang its private data * structure. */ void *priv; void *tagger_data; /* * Configuration data for this switch. */ struct dsa_chip_data *cd; /* * The switch operations. */ const struct dsa_switch_ops *ops; /* * Allow a DSA switch driver to override the phylink MAC ops */ const struct phylink_mac_ops *phylink_mac_ops; /* * User mii_bus and devices for the individual ports. */ u32 phys_mii_mask; struct mii_bus *user_mii_bus; /* Ageing Time limits in msecs */ unsigned int ageing_time_min; unsigned int ageing_time_max; /* Storage for drivers using tag_8021q */ struct dsa_8021q_context *tag_8021q_ctx; /* devlink used to represent this switch device */ struct devlink *devlink; /* Number of switch port queues */ unsigned int num_tx_queues; /* Drivers that benefit from having an ID associated with each * offloaded LAG should set this to the maximum number of * supported IDs. DSA will then maintain a mapping of _at * least_ these many IDs, accessible to drivers via * dsa_lag_id(). */ unsigned int num_lag_ids; /* Drivers that support bridge forwarding offload or FDB isolation * should set this to the maximum number of bridges spanning the same * switch tree (or all trees, in the case of cross-tree bridging * support) that can be offloaded. */ unsigned int max_num_bridges; unsigned int num_ports; }; static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { struct dsa_switch_tree *dst = ds->dst; struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dp->ds == ds && dp->index == p) return dp; return NULL; } static inline bool dsa_port_is_dsa(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_port_is_cpu(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_port_is_user(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_USER; } static inline bool dsa_port_is_unused(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_port_conduit_is_operational(struct dsa_port *dp) { return dsa_port_is_cpu(dp) && dp->conduit_admin_up && dp->conduit_oper_up; } static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_USER; } #define dsa_tree_for_each_user_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) #define dsa_tree_for_each_user_port_continue_reverse(_dp, _dst) \ list_for_each_entry_continue_reverse((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) #define dsa_tree_for_each_cpu_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_cpu((_dp))) #define dsa_switch_for_each_port(_dp, _ds) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_safe(_dp, _next, _ds) \ list_for_each_entry_safe((_dp), (_next), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_continue_reverse(_dp, _ds) \ list_for_each_entry_continue_reverse((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_available_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (!dsa_port_is_unused((_dp))) #define dsa_switch_for_each_user_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_user_port_continue_reverse(_dp, _ds) \ dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_cpu_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) #define dsa_switch_for_each_cpu_port_continue_reverse(_dp, _ds) \ dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) static inline u32 dsa_user_ports(struct dsa_switch *ds) { struct dsa_port *dp; u32 mask = 0; dsa_switch_for_each_user_port(dp, ds) mask |= BIT(dp->index); return mask; } static inline u32 dsa_cpu_ports(struct dsa_switch *ds) { struct dsa_port *cpu_dp; u32 mask = 0; dsa_switch_for_each_cpu_port(cpu_dp, ds) mask |= BIT(cpu_dp->index); return mask; } /* Return the local port used to reach an arbitrary switch device */ static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device) { struct dsa_switch_tree *dst = ds->dst; struct dsa_link *dl; list_for_each_entry(dl, &dst->rtable, list) if (dl->dp->ds == ds && dl->link_dp->ds->index == device) return dl->dp->index; return ds->num_ports; } /* Return the local port used to reach an arbitrary switch port */ static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device, int port) { if (device == ds->index) return port; else return dsa_routing_port(ds, device); } /* Return the local port used to reach the dedicated CPU port */ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port) { const struct dsa_port *dp = dsa_to_port(ds, port); const struct dsa_port *cpu_dp = dp->cpu_dp; if (!cpu_dp) return port; return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index); } /* Return true if this is the local port used to reach the CPU port */ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port) { if (dsa_is_unused_port(ds, port)) return false; return port == dsa_upstream_port(ds, port); } /* Return true if this is a DSA port leading away from the CPU */ static inline bool dsa_is_downstream_port(struct dsa_switch *ds, int port) { return dsa_is_dsa_port(ds, port) && !dsa_is_upstream_port(ds, port); } /* Return the local port used to reach the CPU port */ static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds) { struct dsa_port *dp; dsa_switch_for_each_available_port(dp, ds) { return dsa_upstream_port(ds, dp->index); } return ds->num_ports; } /* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning * that the routing port from @downstream_ds to @upstream_ds is also the port * which @downstream_ds uses to reach its dedicated CPU. */ static inline bool dsa_switch_is_upstream_of(struct dsa_switch *upstream_ds, struct dsa_switch *downstream_ds) { int routing_port; if (upstream_ds == downstream_ds) return true; routing_port = dsa_routing_port(downstream_ds, upstream_ds->index); return dsa_is_upstream_port(downstream_ds, routing_port); } static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) { const struct dsa_switch *ds = dp->ds; if (ds->vlan_filtering_is_global) return ds->vlan_filtering; else return dp->vlan_filtering; } static inline unsigned int dsa_port_lag_id_get(struct dsa_port *dp) { return dp->lag ? dp->lag->id : 0; } static inline struct net_device *dsa_port_lag_dev_get(struct dsa_port *dp) { return dp->lag ? dp->lag->dev : NULL; } static inline bool dsa_port_offloads_lag(struct dsa_port *dp, const struct dsa_lag *lag) { return dsa_port_lag_dev_get(dp) == lag->dev; } static inline struct net_device *dsa_port_to_conduit(const struct dsa_port *dp) { if (dp->cpu_port_in_lag) return dsa_port_lag_dev_get(dp->cpu_dp); return dp->cpu_dp->conduit; } static inline struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp) { if (!dp->bridge) return NULL; if (dp->lag) return dp->lag->dev; else if (dp->hsr_dev) return dp->hsr_dev; return dp->user; } static inline struct net_device * dsa_port_bridge_dev_get(const struct dsa_port *dp) { return dp->bridge ? dp->bridge->dev : NULL; } static inline unsigned int dsa_port_bridge_num_get(struct dsa_port *dp) { return dp->bridge ? dp->bridge->num : 0; } static inline bool dsa_port_bridge_same(const struct dsa_port *a, const struct dsa_port *b) { struct net_device *br_a = dsa_port_bridge_dev_get(a); struct net_device *br_b = dsa_port_bridge_dev_get(b); /* Standalone ports are not in the same bridge with one another */ return (!br_a || !br_b) ? false : (br_a == br_b); } static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp, const struct net_device *dev) { return dsa_port_to_bridge_port(dp) == dev; } static inline bool dsa_port_offloads_bridge_dev(struct dsa_port *dp, const struct net_device *bridge_dev) { /* DSA ports connected to a bridge, and event was emitted * for the bridge. */ return dsa_port_bridge_dev_get(dp) == bridge_dev; } static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, const struct dsa_bridge *bridge) { return dsa_port_bridge_dev_get(dp) == bridge->dev; } /* Returns true if any port of this tree offloads the given net_device */ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, const struct net_device *dev) { struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dsa_port_offloads_bridge_port(dp, dev)) return true; return false; } /* Returns true if any port of this tree offloads the given bridge */ static inline bool dsa_tree_offloads_bridge_dev(struct dsa_switch_tree *dst, const struct net_device *bridge_dev) { struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dsa_port_offloads_bridge_dev(dp, bridge_dev)) return true; return false; } static inline bool dsa_port_tree_same(const struct dsa_port *a, const struct dsa_port *b) { return a->ds->dst == b->ds->dst; } typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { /* * Tagging protocol helpers called for the CPU ports and DSA links. * @get_tag_protocol retrieves the initial tagging protocol and is * mandatory. Switches which can operate using multiple tagging * protocols should implement @change_tag_protocol and report in * @get_tag_protocol the tagger in current use. */ enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, int port, enum dsa_tag_protocol mprot); int (*change_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); /* * Method for switch drivers to connect to the tagging protocol driver * in current use. The switch driver can provide handlers for certain * types of packets for switch management. */ int (*connect_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); int (*port_change_conduit)(struct dsa_switch *ds, int port, struct net_device *conduit, struct netlink_ext_ack *extack); /* Optional switch-wide initialization and destruction methods */ int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); /* Per-port initialization and destruction methods. Mandatory if the * driver registers devlink port regions, optional otherwise. */ int (*port_setup)(struct dsa_switch *ds, int port); void (*port_teardown)(struct dsa_switch *ds, int port); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* * Access to the switch's PHY registers. */ int (*phy_read)(struct dsa_switch *ds, int port, int regnum); int (*phy_write)(struct dsa_switch *ds, int port, int regnum, u16 val); /* * PHYLINK integration */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, struct phylink_config *config); void (*phylink_fixed_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); /* * Port statistics counters. */ void (*get_strings)(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data); void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); int (*get_sset_count)(struct dsa_switch *ds, int port, int sset); void (*get_ethtool_phy_stats)(struct dsa_switch *ds, int port, uint64_t *data); void (*get_eth_phy_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_phy_stats *phy_stats); void (*get_eth_mac_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_mac_stats *mac_stats); void (*get_eth_ctrl_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_ctrl_stats *ctrl_stats); void (*get_rmon_stats)(struct dsa_switch *ds, int port, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges); void (*get_ts_stats)(struct dsa_switch *ds, int port, struct ethtool_ts_stats *ts_stats); void (*get_stats64)(struct dsa_switch *ds, int port, struct rtnl_link_stats64 *s); void (*get_pause_stats)(struct dsa_switch *ds, int port, struct ethtool_pause_stats *pause_stats); void (*self_test)(struct dsa_switch *ds, int port, struct ethtool_test *etest, u64 *data); /* * ethtool Wake-on-LAN */ void (*get_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); int (*set_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); /* * ethtool timestamp info */ int (*get_ts_info)(struct dsa_switch *ds, int port, struct kernel_ethtool_ts_info *ts); /* * ethtool MAC merge layer */ int (*get_mm)(struct dsa_switch *ds, int port, struct ethtool_mm_state *state); int (*set_mm)(struct dsa_switch *ds, int port, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack); void (*get_mm_stats)(struct dsa_switch *ds, int port, struct ethtool_mm_stats *stats); /* * DCB ops */ int (*port_get_default_prio)(struct dsa_switch *ds, int port); int (*port_set_default_prio)(struct dsa_switch *ds, int port, u8 prio); int (*port_get_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp); int (*port_add_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); int (*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); int (*port_set_apptrust)(struct dsa_switch *ds, int port, const u8 *sel, int nsel); int (*port_get_apptrust)(struct dsa_switch *ds, int port, u8 *sel, int *nsel); /* * Suspend and resume */ int (*suspend)(struct dsa_switch *ds); int (*resume)(struct dsa_switch *ds); /* * Port enable/disable */ int (*port_enable)(struct dsa_switch *ds, int port, struct phy_device *phy); void (*port_disable)(struct dsa_switch *ds, int port); /* * Notification for MAC address changes on user ports. Drivers can * currently only veto operations. They should not use the method to * program the hardware, since the operation is not rolled back in case * of other errors. */ int (*port_set_mac_address)(struct dsa_switch *ds, int port, const unsigned char *addr); /* * Compatibility between device trees defining multiple CPU ports and * drivers which are not OK to use by default the numerically smallest * CPU port of a switch for its local ports. This can return NULL, * meaning "don't know/don't care". */ struct dsa_port *(*preferred_default_local_cpu_port)(struct dsa_switch *ds); /* * Port's MAC EEE settings */ bool (*support_eee)(struct dsa_switch *ds, int port); int (*set_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_keee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); int (*get_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); int (*set_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); /* * Register access. */ int (*get_regs_len)(struct dsa_switch *ds, int port); void (*get_regs)(struct dsa_switch *ds, int port, struct ethtool_regs *regs, void *p); /* * Upper device tracking. */ int (*port_prechangeupper)(struct dsa_switch *ds, int port, struct netdev_notifier_changeupper_info *info); /* * Bridge integration */ int (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs); int (*port_bridge_join)(struct dsa_switch *ds, int port, struct dsa_bridge bridge, bool *tx_fwd_offload, struct netlink_ext_ack *extack); void (*port_bridge_leave)(struct dsa_switch *ds, int port, struct dsa_bridge bridge); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); int (*port_mst_state_set)(struct dsa_switch *ds, int port, const struct switchdev_mst_state *state); void (*port_fast_age)(struct dsa_switch *ds, int port); int (*port_vlan_fast_age)(struct dsa_switch *ds, int port, u16 vid); int (*port_pre_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); int (*port_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); void (*port_set_host_flood)(struct dsa_switch *ds, int port, bool uc, bool mc); /* * VLAN support */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, bool vlan_filtering, struct netlink_ext_ack *extack); int (*port_vlan_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); int (*vlan_msti_set)(struct dsa_switch *ds, struct dsa_bridge bridge, const struct switchdev_vlan_msti *msti); /* * Forwarding database */ int (*port_fdb_add)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); int (*port_fdb_del)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); int (*port_fdb_dump)(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); int (*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag, const unsigned char *addr, u16 vid, struct dsa_db db); int (*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag, const unsigned char *addr, u16 vid, struct dsa_db db); /* * Multicast database */ int (*port_mdb_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); int (*port_mdb_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); /* * RXNFC */ int (*get_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc, u32 *rule_locs); int (*set_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc); /* * TC integration */ int (*cls_flower_add)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_del)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_stats)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*port_mirror_add)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress, struct netlink_ext_ack *extack); void (*port_mirror_del)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); int (*port_policer_add)(struct dsa_switch *ds, int port, struct dsa_mall_policer_tc_entry *policer); void (*port_policer_del)(struct dsa_switch *ds, int port); int (*port_setup_tc)(struct dsa_switch *ds, int port, enum tc_setup_type type, void *type_data); /* * Cross-chip operations */ int (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge, struct netlink_ext_ack *extack); void (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge); int (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index, int port); int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag, struct netdev_lag_upper_info *info, struct netlink_ext_ack *extack); int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag); /* * PTP functionality */ int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, struct kernel_hwtstamp_config *config); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); void (*port_txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); /* Devlink parameters, etc */ int (*devlink_param_get)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_param_set)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_info_get)(struct dsa_switch *ds, struct devlink_info_req *req, struct netlink_ext_ack *extack); int (*devlink_sb_pool_get)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, struct devlink_sb_pool_info *pool_info); int (*devlink_sb_pool_set)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, u32 size, enum devlink_sb_threshold_type threshold_type, struct netlink_ext_ack *extack); int (*devlink_sb_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_threshold); int (*devlink_sb_port_pool_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_tc_pool_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 *p_pool_index, u32 *p_threshold); int (*devlink_sb_tc_pool_bind_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_occ_snapshot)(struct dsa_switch *ds, unsigned int sb_index); int (*devlink_sb_occ_max_clear)(struct dsa_switch *ds, unsigned int sb_index); int (*devlink_sb_occ_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_cur, u32 *p_max); int (*devlink_sb_occ_tc_port_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u32 *p_cur, u32 *p_max); /* * MTU change functionality. Switches can also adjust their MRU through * this method. By MTU, one understands the SDU (L2 payload) length. * If the switch needs to account for the DSA tag on the CPU port, this * method needs to do so privately. */ int (*port_change_mtu)(struct dsa_switch *ds, int port, int new_mtu); int (*port_max_mtu)(struct dsa_switch *ds, int port); /* * LAG integration */ int (*port_lag_change)(struct dsa_switch *ds, int port); int (*port_lag_join)(struct dsa_switch *ds, int port, struct dsa_lag lag, struct netdev_lag_upper_info *info, struct netlink_ext_ack *extack); int (*port_lag_leave)(struct dsa_switch *ds, int port, struct dsa_lag lag); /* * HSR integration */ int (*port_hsr_join)(struct dsa_switch *ds, int port, struct net_device *hsr, struct netlink_ext_ack *extack); int (*port_hsr_leave)(struct dsa_switch *ds, int port, struct net_device *hsr); /* * MRP integration */ int (*port_mrp_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_add_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); /* * tag_8021q operations */ int (*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid, u16 flags); int (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid); /* * DSA conduit tracking operations */ void (*conduit_state_change)(struct dsa_switch *ds, const struct net_device *conduit, bool operational); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes, \ dsa_devlink_param_get, dsa_devlink_param_set, NULL) int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx); int dsa_devlink_param_set(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack); int dsa_devlink_params_register(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); void dsa_devlink_params_unregister(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); int dsa_devlink_resource_register(struct dsa_switch *ds, const char *resource_name, u64 resource_size, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params); void dsa_devlink_resources_unregister(struct dsa_switch *ds); void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv); void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, u64 resource_id); struct devlink_region * dsa_devlink_region_create(struct dsa_switch *ds, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size); struct devlink_region * dsa_devlink_port_region_create(struct dsa_switch *ds, int port, const struct devlink_port_region_ops *ops, u32 region_max_snapshots, u64 region_size); void dsa_devlink_region_destroy(struct devlink_region *region); struct dsa_port *dsa_port_from_netdev(struct net_device *netdev); struct dsa_devlink_priv { struct dsa_switch *ds; }; static inline struct dsa_switch *dsa_devlink_to_ds(struct devlink *dl) { struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline struct dsa_switch *dsa_devlink_port_to_ds(struct devlink_port *port) { struct devlink *dl = port->devlink; struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline int dsa_devlink_port_to_port(struct devlink_port *port) { return port->index; } struct dsa_switch_driver { struct list_head list; const struct dsa_switch_ops *ops; }; bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) { #if IS_ENABLED(CONFIG_NET_DSA) return dev->dsa_ptr && dev->dsa_ptr->rcv; #endif return false; } /* All DSA tags that push the EtherType to the right (basically all except tail * tags, which don't break dissection) can be treated the same from the * perspective of the flow dissector. * * We need to return: * - offset: the (B - A) difference between: * A. the position of the real EtherType and * B. the current skb->data (aka ETH_HLEN bytes into the frame, aka 2 bytes * after the normal EtherType was supposed to be) * The offset in bytes is exactly equal to the tagger overhead (and half of * that, in __be16 shorts). * * - proto: the value of the real EtherType. */ static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb, __be16 *proto, int *offset) { #if IS_ENABLED(CONFIG_NET_DSA) const struct dsa_device_ops *ops = skb->dev->dsa_ptr->tag_ops; int tag_len = ops->needed_headroom; *offset = tag_len; *proto = ((__be16 *)skb->data)[(tag_len / 2) - 1]; #endif } void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); void dsa_flush_workqueue(void); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); #else static inline int dsa_switch_suspend(struct dsa_switch *ds) { return 0; } static inline int dsa_switch_resume(struct dsa_switch *ds) { return 0; } #endif /* CONFIG_PM_SLEEP */ #if IS_ENABLED(CONFIG_NET_DSA) bool dsa_user_dev_check(const struct net_device *dev); #else static inline bool dsa_user_dev_check(const struct net_device *dev) { return false; } #endif netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); bool dsa_supports_eee(struct dsa_switch *ds, int port); #endif
14 1 13 7 7 4 4 4 4 3 3 1 2 1 1 1 55 6 51 40 12 3 5 29 2 3 3 21 4 3 1 2 1 1 1 2 1 1 2 114 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C)2003,2004 USAGI/WIDE Project * * Author: * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include <net/ip6_checksum.h> #include <linux/seq_file.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> #include "nf_internals.h" static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmp6hdr *hp; struct icmp6hdr _hdr; hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hp == NULL) return false; tuple->dst.u.icmp.type = hp->icmp6_type; tuple->src.u.icmp.id = hp->icmp6_identifier; tuple->dst.u.icmp.code = hp->icmp6_code; return true; } /* Add 1; spaces filled with 0. */ static const u_int8_t invmap[] = { [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY + 1 }; static const u_int8_t noct_valid_new[] = { [ICMPV6_MGM_QUERY - 130] = 1, [ICMPV6_MGM_REPORT - 130] = 1, [ICMPV6_MGM_REDUCTION - 130] = 1, [NDISC_ROUTER_SOLICITATION - 130] = 1, [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, [ICMPV6_MLD2_REPORT - 130] = 1, [ICMPV6_MRDISC_ADV - 130] = 1, [ICMPV6_MRDISC_SOL - 130] = 1 }; bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { int type = orig->dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(invmap) || !invmap[type]) return false; tuple->src.u.icmp.id = orig->src.u.icmp.id; tuple->dst.u.icmp.type = invmap[type] - 1; tuple->dst.u.icmp.code = orig->dst.u.icmp.code; return true; } static unsigned int *icmpv6_get_timeouts(struct net *net) { return &nf_icmpv6_pernet(net)->timeout; } /* Returns verdict for packet, or -1 for invalid. */ int nf_conntrack_icmpv6_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned int *timeout = nf_ct_timeout_lookup(ct); static const u8 valid_new[] = { [ICMPV6_ECHO_REQUEST - 128] = 1, [ICMPV6_NI_QUERY - 128] = 1 }; if (state->pf != NFPROTO_IPV6) return -NF_ACCEPT; if (!nf_ct_is_confirmed(ct)) { int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { /* Can't create a new ICMPv6 `conn' with this. */ pr_debug("icmpv6: can't create new conn with type %u\n", type + 128); nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); return -NF_ACCEPT; } } if (!timeout) timeout = icmpv6_get_timeouts(nf_ct_net(ct)); /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } static void icmpv6_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg); } static noinline_for_stack int nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { u8 hl = ipv6_hdr(skb)->hop_limit; union nf_inet_addr outer_daddr; union { struct nd_opt_hdr nd_opt; struct rd_msg rd_msg; } tmp; const struct nd_opt_hdr *nd_opt; const struct rd_msg *rd_msg; rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg); if (!rd_msg) { icmpv6_error_log(skb, state, "short redirect"); return -NF_ACCEPT; } if (rd_msg->icmph.icmp6_code != 0) return NF_ACCEPT; if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect"); return -NF_ACCEPT; } dataoff += sizeof(*rd_msg); /* warning: rd_msg no longer usable after this call */ nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt); if (!nd_opt || nd_opt->nd_opt_len == 0) { icmpv6_error_log(skb, state, "redirect without options"); return -NF_ACCEPT; } /* We could call ndisc_parse_options(), but it would need * skb_linearize() and a bit more work. */ if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR) return NF_ACCEPT; memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, sizeof(outer_daddr.ip6)); dataoff += 8; return nf_conntrack_inet_error(tmpl, skb, dataoff, state, IPPROTO_ICMPV6, &outer_daddr); } int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { union nf_inet_addr outer_daddr; const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; int type; icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { icmpv6_error_log(skb, state, "short packet"); return -NF_ACCEPT; } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_ip6_checksum(skb, state->hook, dataoff, IPPROTO_ICMPV6)) { icmpv6_error_log(skb, state, "ICMPv6 checksum failed"); return -NF_ACCEPT; } type = icmp6h->icmp6_type - 130; if (type >= 0 && type < sizeof(noct_valid_new) && noct_valid_new[type]) { nf_ct_set(skb, NULL, IP_CT_UNTRACKED); return NF_ACCEPT; } if (icmp6h->icmp6_type == NDISC_REDIRECT) return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state); /* is not error message ? */ if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, sizeof(outer_daddr.ip6)); dataoff += sizeof(*icmp6h); return nf_conntrack_inet_error(tmpl, skb, dataoff, state, IPPROTO_ICMPV6, &outer_daddr); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { if (nla_put_be16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id) || nla_put_u8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type) || nla_put_u8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_ICMPV6_TYPE] = { .type = NLA_U8 }, [CTA_PROTO_ICMPV6_CODE] = { .type = NLA_U8 }, [CTA_PROTO_ICMPV6_ID] = { .type = NLA_U16 }, }; static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) { if (!tb[CTA_PROTO_ICMPV6_TYPE]) return -EINVAL; tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); if (tuple->dst.u.icmp.type < 128 || tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || !invmap[tuple->dst.u.icmp.type - 128]) return -EINVAL; } if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) { if (!tb[CTA_PROTO_ICMPV6_CODE]) return -EINVAL; tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); } if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) { if (!tb[CTA_PROTO_ICMPV6_ID]) return -EINVAL; tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); } return 0; } static unsigned int icmpv6_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); return size; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; struct nf_icmp_net *in = nf_icmpv6_pernet(net); if (!timeout) timeout = icmpv6_get_timeouts(net); if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; } else { /* Set default ICMPv6 timeout. */ *timeout = in->timeout; } return 0; } static int icmpv6_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeout = data; if (nla_put_be32(skb, CTA_TIMEOUT_ICMPV6_TIMEOUT, htonl(*timeout / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_icmpv6_init_net(struct net *net) { struct nf_icmp_net *in = nf_icmpv6_pernet(net); in->timeout = nf_ct_icmpv6_timeout; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = { .l4proto = IPPROTO_ICMPV6, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmpv6_tuple_to_nlattr, .nlattr_tuple_size = icmpv6_nlattr_tuple_size, .nlattr_to_tuple = icmpv6_nlattr_to_tuple, .nla_policy = icmpv6_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj, .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_ICMP_MAX, .obj_size = sizeof(unsigned int), .nla_policy = icmpv6_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ };
357 450 450 6 443 450 356 357 357 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/errno.h> #include <linux/unistd.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <asm/ucontext.h> #include <asm/fpu/signal.h> #include <asm/sighandling.h> #include <asm/syscall.h> #include <asm/sigframe.h> #include <asm/signal.h> /* * If regs->ss will cause an IRET fault, change it. Otherwise leave it * alone. Using this generally makes no sense unless * user_64bit_mode(regs) would return true. */ static void force_valid_ss(struct pt_regs *regs) { u32 ar; asm volatile ("lar %[old_ss], %[ar]\n\t" "jz 1f\n\t" /* If invalid: */ "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ "1:" : [ar] "=r" (ar) : [old_ss] "rm" ((u16)regs->ss)); /* * For a valid 64-bit user context, we need DPL 3, type * read-write data or read-write exp-down data, and S and P * set. We can't use VERW because VERW doesn't check the * P bit. */ ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) regs->ss = __USER_DS; } static bool restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, unsigned long uc_flags) { struct sigcontext sc; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; if (copy_from_user(&sc, usc, offsetof(struct sigcontext, reserved1))) return false; regs->bx = sc.bx; regs->cx = sc.cx; regs->dx = sc.dx; regs->si = sc.si; regs->di = sc.di; regs->bp = sc.bp; regs->ax = sc.ax; regs->sp = sc.sp; regs->ip = sc.ip; regs->r8 = sc.r8; regs->r9 = sc.r9; regs->r10 = sc.r10; regs->r11 = sc.r11; regs->r12 = sc.r12; regs->r13 = sc.r13; regs->r14 = sc.r14; regs->r15 = sc.r15; /* Get CS/SS and force CPL3 */ regs->cs = sc.cs | 0x03; regs->ss = sc.ss | 0x03; regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); /* disable syscall checks */ regs->orig_ax = -1; /* * Fix up SS if needed for the benefit of old DOSEMU and * CRIU. */ if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) force_valid_ss(regs); return fpu__restore_sig((void __user *)sc.fpstate, 0); } static __always_inline int __unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { unsafe_put_user(regs->di, &sc->di, Efault); unsafe_put_user(regs->si, &sc->si, Efault); unsafe_put_user(regs->bp, &sc->bp, Efault); unsafe_put_user(regs->sp, &sc->sp, Efault); unsafe_put_user(regs->bx, &sc->bx, Efault); unsafe_put_user(regs->dx, &sc->dx, Efault); unsafe_put_user(regs->cx, &sc->cx, Efault); unsafe_put_user(regs->ax, &sc->ax, Efault); unsafe_put_user(regs->r8, &sc->r8, Efault); unsafe_put_user(regs->r9, &sc->r9, Efault); unsafe_put_user(regs->r10, &sc->r10, Efault); unsafe_put_user(regs->r11, &sc->r11, Efault); unsafe_put_user(regs->r12, &sc->r12, Efault); unsafe_put_user(regs->r13, &sc->r13, Efault); unsafe_put_user(regs->r14, &sc->r14, Efault); unsafe_put_user(regs->r15, &sc->r15, Efault); unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); unsafe_put_user(current->thread.error_code, &sc->err, Efault); unsafe_put_user(regs->ip, &sc->ip, Efault); unsafe_put_user(regs->flags, &sc->flags, Efault); unsafe_put_user(regs->cs, &sc->cs, Efault); unsafe_put_user(0, &sc->gs, Efault); unsafe_put_user(0, &sc->fs, Efault); unsafe_put_user(regs->ss, &sc->ss, Efault); unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); /* non-iBCS2 extensions.. */ unsafe_put_user(mask, &sc->oldmask, Efault); unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); return 0; Efault: return -EFAULT; } #define unsafe_put_sigcontext(sc, fp, regs, set, label) \ do { \ if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ goto label; \ } while(0); #define unsafe_put_sigmask(set, frame, label) \ unsafe_put_user(*(__u64 *)(set), \ (__u64 __user *)&(frame)->uc.uc_sigmask, \ label) static unsigned long frame_uc_flags(struct pt_regs *regs) { unsigned long flags; if (boot_cpu_has(X86_FEATURE_XSAVE)) flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; else flags = UC_SIGCONTEXT_SS; if (likely(user_64bit_mode(regs))) flags |= UC_STRICT_RESTORE_SS; return flags; } int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { sigset_t *set = sigmask_to_save(); struct rt_sigframe __user *frame; void __user *fp = NULL; unsigned long uc_flags; /* x86-64 should always use SA_RESTORER. */ if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) return -EFAULT; frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); uc_flags = frame_uc_flags(regs); if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; /* Create the ucontext. */ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); unsafe_put_user(0, &frame->uc.uc_link, Efault); unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); /* Set up to return from userspace. If provided, use a stub already in userspace. */ unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); unsafe_put_sigmask(set, frame, Efault); user_access_end(); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } if (setup_signal_shadow_stack(ksig)) return -EFAULT; /* Set up registers for signal handler */ regs->di = ksig->sig; /* In case the signal handler was declared without prototypes */ regs->ax = 0; /* This also works for non SA_SIGINFO handlers because they expect the next argument after the signal number on the stack. */ regs->si = (unsigned long)&frame->info; regs->dx = (unsigned long)&frame->uc; regs->ip = (unsigned long) ksig->ka.sa.sa_handler; regs->sp = (unsigned long)frame; /* * Set up the CS and SS registers to run signal handlers in * 64-bit mode, even if the handler happens to be interrupting * 32-bit or 16-bit code. * * SS is subtle. In 64-bit mode, we don't need any particular * SS descriptor, but we do need SS to be valid. It's possible * that the old SS is entirely bogus -- this can happen if the * signal we're trying to deliver is #GP or #SS caused by a bad * SS value. We also have a compatibility issue here: DOSEMU * relies on the contents of the SS register indicating the * SS value at the time of the signal, even though that code in * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU * avoids relying on sigreturn to restore SS; instead it uses * a trampoline.) So we do our best: if the old SS was valid, * we keep it. Otherwise we replace it. */ regs->cs = __USER_CS; if (unlikely(regs->ss != __USER_DS)) force_valid_ss(regs); return 0; Efault: user_access_end(); return -EFAULT; } /* * Do a signal return; undo the signal stack. */ SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; sigset_t set; unsigned long uc_flags; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; set_current_blocked(&set); if (restore_altstack(&frame->uc.uc_stack)) goto badframe; if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_signal_shadow_stack()) goto badframe; return regs->ax; badframe: signal_fault(regs, frame, "rt_sigreturn"); return 0; } #ifdef CONFIG_X86_X32_ABI static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, const struct kernel_siginfo *from) { struct compat_siginfo new; copy_siginfo_to_external32(&new, from); if (from->si_signo == SIGCHLD) { new._sifields._sigchld_x32._utime = from->si_utime; new._sifields._sigchld_x32._stime = from->si_stime; } if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; return 0; } int copy_siginfo_to_user32(struct compat_siginfo __user *to, const struct kernel_siginfo *from) { if (in_x32_syscall()) return x32_copy_siginfo_to_user(to, from); return __copy_siginfo_to_user32(to, from); } int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { compat_sigset_t *set = (compat_sigset_t *) sigmask_to_save(); struct rt_sigframe_x32 __user *frame; unsigned long uc_flags; void __user *restorer; void __user *fp = NULL; if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) return -EFAULT; frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); uc_flags = frame_uc_flags(regs); if (setup_signal_shadow_stack(ksig)) return -EFAULT; if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; /* Create the ucontext. */ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); unsafe_put_user(0, &frame->uc.uc_link, Efault); unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); unsafe_put_user(0, &frame->uc.uc__pad0, Efault); restorer = ksig->ka.sa.sa_restorer; unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); unsafe_put_sigmask(set, frame, Efault); user_access_end(); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; regs->ip = (unsigned long) ksig->ka.sa.sa_handler; /* We use the x32 calling convention here... */ regs->di = ksig->sig; regs->si = (unsigned long) &frame->info; regs->dx = (unsigned long) &frame->uc; loadsegment(ds, __USER_DS); loadsegment(es, __USER_DS); regs->cs = __USER_CS; regs->ss = __USER_DS; return 0; Efault: user_access_end(); return -EFAULT; } COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; unsigned long uc_flags; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; set_current_blocked(&set); if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_signal_shadow_stack()) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; return regs->ax; badframe: signal_fault(regs, frame, "x32 rt_sigreturn"); return 0; } #endif /* CONFIG_X86_X32_ABI */ #ifdef CONFIG_COMPAT void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { if (!act) return; if (in_ia32_syscall()) act->sa.sa_flags |= SA_IA32_ABI; if (in_x32_syscall()) act->sa.sa_flags |= SA_X32_ABI; } #endif /* CONFIG_COMPAT */ /* * If adding a new si_code, there is probably new data in * the siginfo. Make sure folks bumping the si_code * limits also have to look at this code. Make sure any * new fields are handled in copy_siginfo_to_user32()! */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); static_assert(NSIGSYS == 2); /* This is part of the ABI and can never change in size: */ static_assert(sizeof(siginfo_t) == 128); /* This is a part of the ABI and can never change in alignment */ static_assert(__alignof__(siginfo_t) == 8); /* * The offsets of all the (unioned) si_fields are fixed * in the ABI, of course. Make sure none of them ever * move and are always at the beginning: */ static_assert(offsetof(siginfo_t, si_signo) == 0); static_assert(offsetof(siginfo_t, si_errno) == 4); static_assert(offsetof(siginfo_t, si_code) == 8); /* * Ensure that the size of each si_field never changes. * If it does, it is a sign that the * copy_siginfo_to_user32() code below needs to updated * along with the size in the CHECK_SI_SIZE(). * * We repeat this check for both the generic and compat * siginfos. * * Note: it is OK for these to grow as long as the whole * structure stays within the padding size (checked * above). */ #define CHECK_SI_OFFSET(name) \ static_assert(offsetof(siginfo_t, _sifields) == \ offsetof(siginfo_t, _sifields.name)) #define CHECK_SI_SIZE(name, size) \ static_assert(sizeof_field(siginfo_t, _sifields.name) == size) CHECK_SI_OFFSET(_kill); CHECK_SI_SIZE (_kill, 2*sizeof(int)); static_assert(offsetof(siginfo_t, si_pid) == 0x10); static_assert(offsetof(siginfo_t, si_uid) == 0x14); CHECK_SI_OFFSET(_timer); CHECK_SI_SIZE (_timer, 6*sizeof(int)); static_assert(offsetof(siginfo_t, si_tid) == 0x10); static_assert(offsetof(siginfo_t, si_overrun) == 0x14); static_assert(offsetof(siginfo_t, si_value) == 0x18); CHECK_SI_OFFSET(_rt); CHECK_SI_SIZE (_rt, 4*sizeof(int)); static_assert(offsetof(siginfo_t, si_pid) == 0x10); static_assert(offsetof(siginfo_t, si_uid) == 0x14); static_assert(offsetof(siginfo_t, si_value) == 0x18); CHECK_SI_OFFSET(_sigchld); CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); static_assert(offsetof(siginfo_t, si_pid) == 0x10); static_assert(offsetof(siginfo_t, si_uid) == 0x14); static_assert(offsetof(siginfo_t, si_status) == 0x18); static_assert(offsetof(siginfo_t, si_utime) == 0x20); static_assert(offsetof(siginfo_t, si_stime) == 0x28); #ifdef CONFIG_X86_X32_ABI /* no _sigchld_x32 in the generic siginfo_t */ static_assert(sizeof_field(compat_siginfo_t, _sifields._sigchld_x32) == 7*sizeof(int)); static_assert(offsetof(compat_siginfo_t, _sifields) == offsetof(compat_siginfo_t, _sifields._sigchld_x32)); static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) == 0x18); static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) == 0x20); #endif CHECK_SI_OFFSET(_sigfault); CHECK_SI_SIZE (_sigfault, 8*sizeof(int)); static_assert(offsetof(siginfo_t, si_addr) == 0x10); static_assert(offsetof(siginfo_t, si_trapno) == 0x18); static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18); static_assert(offsetof(siginfo_t, si_lower) == 0x20); static_assert(offsetof(siginfo_t, si_upper) == 0x28); static_assert(offsetof(siginfo_t, si_pkey) == 0x20); static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); CHECK_SI_OFFSET(_sigpoll); CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); static_assert(offsetof(siginfo_t, si_band) == 0x10); static_assert(offsetof(siginfo_t, si_fd) == 0x18); CHECK_SI_OFFSET(_sigsys); CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); static_assert(offsetof(siginfo_t, si_syscall) == 0x18); static_assert(offsetof(siginfo_t, si_arch) == 0x1C); /* any new si_fields should be added here */
38 38 38 38 38 38 151 150 151 64 62 64 38 38 38 142 148 142 28 203 203 1 1 1 1 8 162 8 155 18 18 18 50 117 50 88 281 203 121 122 262 262 105 226 67 263 156 156 1 22 132 25 156 88 88 1 16 58 47 88 71 71 1 11 71 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/switchdev/switchdev.c - Switch device API * Copyright (c) 2014-2015 Jiri Pirko <jiri@resnulli.us> * Copyright (c) 2014-2015 Scott Feldman <sfeldma@gmail.com> */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_bridge.h> #include <linux/list.h> #include <linux/workqueue.h> #include <linux/if_vlan.h> #include <linux/rtnetlink.h> #include <net/switchdev.h> static bool switchdev_obj_eq(const struct switchdev_obj *a, const struct switchdev_obj *b) { const struct switchdev_obj_port_vlan *va, *vb; const struct switchdev_obj_port_mdb *ma, *mb; if (a->id != b->id || a->orig_dev != b->orig_dev) return false; switch (a->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: va = SWITCHDEV_OBJ_PORT_VLAN(a); vb = SWITCHDEV_OBJ_PORT_VLAN(b); return va->flags == vb->flags && va->vid == vb->vid && va->changed == vb->changed; case SWITCHDEV_OBJ_ID_PORT_MDB: case SWITCHDEV_OBJ_ID_HOST_MDB: ma = SWITCHDEV_OBJ_PORT_MDB(a); mb = SWITCHDEV_OBJ_PORT_MDB(b); return ma->vid == mb->vid && ether_addr_equal(ma->addr, mb->addr); default: break; } BUG(); } static LIST_HEAD(deferred); static DEFINE_SPINLOCK(deferred_lock); typedef void switchdev_deferred_func_t(struct net_device *dev, const void *data); struct switchdev_deferred_item { struct list_head list; struct net_device *dev; netdevice_tracker dev_tracker; switchdev_deferred_func_t *func; unsigned long data[]; }; static struct switchdev_deferred_item *switchdev_deferred_dequeue(void) { struct switchdev_deferred_item *dfitem; spin_lock_bh(&deferred_lock); if (list_empty(&deferred)) { dfitem = NULL; goto unlock; } dfitem = list_first_entry(&deferred, struct switchdev_deferred_item, list); list_del(&dfitem->list); unlock: spin_unlock_bh(&deferred_lock); return dfitem; } /** * switchdev_deferred_process - Process ops in deferred queue * * Called to flush the ops currently queued in deferred ops queue. * rtnl_lock must be held. */ void switchdev_deferred_process(void) { struct switchdev_deferred_item *dfitem; ASSERT_RTNL(); while ((dfitem = switchdev_deferred_dequeue())) { dfitem->func(dfitem->dev, dfitem->data); netdev_put(dfitem->dev, &dfitem->dev_tracker); kfree(dfitem); } } EXPORT_SYMBOL_GPL(switchdev_deferred_process); static void switchdev_deferred_process_work(struct work_struct *work) { rtnl_lock(); switchdev_deferred_process(); rtnl_unlock(); } static DECLARE_WORK(deferred_process_work, switchdev_deferred_process_work); static int switchdev_deferred_enqueue(struct net_device *dev, const void *data, size_t data_len, switchdev_deferred_func_t *func) { struct switchdev_deferred_item *dfitem; dfitem = kmalloc(struct_size(dfitem, data, data_len), GFP_ATOMIC); if (!dfitem) return -ENOMEM; dfitem->dev = dev; dfitem->func = func; memcpy(dfitem->data, data, data_len); netdev_hold(dev, &dfitem->dev_tracker, GFP_ATOMIC); spin_lock_bh(&deferred_lock); list_add_tail(&dfitem->list, &deferred); spin_unlock_bh(&deferred_lock); schedule_work(&deferred_process_work); return 0; } static int switchdev_port_attr_notify(enum switchdev_notifier_type nt, struct net_device *dev, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { int err; int rc; struct switchdev_notifier_port_attr_info attr_info = { .attr = attr, .handled = false, }; rc = call_switchdev_blocking_notifiers(nt, dev, &attr_info.info, extack); err = notifier_to_errno(rc); if (err) { WARN_ON(!attr_info.handled); return err; } if (!attr_info.handled) return -EOPNOTSUPP; return 0; } static int switchdev_port_attr_set_now(struct net_device *dev, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { return switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr, extack); } static void switchdev_port_attr_set_deferred(struct net_device *dev, const void *data) { const struct switchdev_attr *attr = data; int err; err = switchdev_port_attr_set_now(dev, attr, NULL); if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n", err, attr->id); if (attr->complete) attr->complete(dev, err, attr->complete_priv); } static int switchdev_port_attr_set_defer(struct net_device *dev, const struct switchdev_attr *attr) { return switchdev_deferred_enqueue(dev, attr, sizeof(*attr), switchdev_port_attr_set_deferred); } /** * switchdev_port_attr_set - Set port attribute * * @dev: port device * @attr: attribute to set * @extack: netlink extended ack, for error message propagation * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. */ int switchdev_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { if (attr->flags & SWITCHDEV_F_DEFER) return switchdev_port_attr_set_defer(dev, attr); ASSERT_RTNL(); return switchdev_port_attr_set_now(dev, attr, extack); } EXPORT_SYMBOL_GPL(switchdev_port_attr_set); static size_t switchdev_obj_size(const struct switchdev_obj *obj) { switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: return sizeof(struct switchdev_obj_port_vlan); case SWITCHDEV_OBJ_ID_PORT_MDB: return sizeof(struct switchdev_obj_port_mdb); case SWITCHDEV_OBJ_ID_HOST_MDB: return sizeof(struct switchdev_obj_port_mdb); default: BUG(); } return 0; } static int switchdev_port_obj_notify(enum switchdev_notifier_type nt, struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { int rc; int err; struct switchdev_notifier_port_obj_info obj_info = { .obj = obj, .handled = false, }; rc = call_switchdev_blocking_notifiers(nt, dev, &obj_info.info, extack); err = notifier_to_errno(rc); if (err) { WARN_ON(!obj_info.handled); return err; } if (!obj_info.handled) return -EOPNOTSUPP; return 0; } static void switchdev_obj_id_to_helpful_msg(struct net_device *dev, enum switchdev_obj_id obj_id, int err, bool add) { const char *action = add ? "add" : "del"; const char *reason = ""; const char *problem; const char *obj_str; switch (obj_id) { case SWITCHDEV_OBJ_ID_UNDEFINED: obj_str = "Undefined object"; problem = "Attempted operation is undefined, indicating a possible programming\n" "error.\n"; break; case SWITCHDEV_OBJ_ID_PORT_VLAN: obj_str = "VLAN entry"; problem = "Failure in VLAN settings on this port might disrupt network\n" "segmentation or traffic isolation, affecting network partitioning.\n"; break; case SWITCHDEV_OBJ_ID_PORT_MDB: obj_str = "Port Multicast Database entry"; problem = "Failure in updating the port's Multicast Database could lead to\n" "multicast forwarding issues.\n"; break; case SWITCHDEV_OBJ_ID_HOST_MDB: obj_str = "Host Multicast Database entry"; problem = "Failure in updating the host's Multicast Database may impact multicast\n" "group memberships or traffic delivery, affecting multicast\n" "communication.\n"; break; case SWITCHDEV_OBJ_ID_MRP: obj_str = "Media Redundancy Protocol configuration for port"; problem = "Failure to set MRP ring ID on this port prevents communication with\n" "the specified redundancy ring, resulting in an inability to engage\n" "in MRP-based network operations.\n"; break; case SWITCHDEV_OBJ_ID_RING_TEST_MRP: obj_str = "MRP Test Frame Operations for port"; problem = "Failure to generate/monitor MRP test frames may lead to inability to\n" "assess the ring's operational integrity and fault response, hindering\n" "proactive network management.\n"; break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: obj_str = "MRP Ring Role Configuration"; problem = "Improper MRP ring role configuration may create conflicts in the ring,\n" "disrupting communication for all participants, or isolate the local\n" "system from the ring, hindering its ability to communicate with other\n" "participants.\n"; break; case SWITCHDEV_OBJ_ID_RING_STATE_MRP: obj_str = "MRP Ring State Configuration"; problem = "Failure to correctly set the MRP ring state can result in network\n" "loops or leave segments without communication. In a Closed state,\n" "it maintains loop prevention by blocking one MRM port, while an Open\n" "state activates in response to failures, changing port states to\n" "preserve network connectivity.\n"; break; case SWITCHDEV_OBJ_ID_IN_TEST_MRP: obj_str = "MRP_InTest Frame Generation Configuration"; problem = "Failure in managing MRP_InTest frame generation can misjudge the\n" "interconnection ring's state, leading to incorrect blocking or\n" "unblocking of the I/C port. This misconfiguration might result\n" "in unintended network loops or isolate critical network segments,\n" "compromising network integrity and reliability.\n"; break; case SWITCHDEV_OBJ_ID_IN_ROLE_MRP: obj_str = "Interconnection Ring Role Configuration"; problem = "Failure in incorrect assignment of interconnection ring roles\n" "(MIM/MIC) can impair the formation of the interconnection rings.\n"; break; case SWITCHDEV_OBJ_ID_IN_STATE_MRP: obj_str = "Interconnection Ring State Configuration"; problem = "Failure in updating the interconnection ring state can lead in\n" "case of Open state to incorrect blocking or unblocking of the\n" "I/C port, resulting in unintended network loops or isolation\n" "of critical network\n"; break; default: obj_str = "Unknown object"; problem = "Indicating a possible programming error.\n"; } switch (err) { case -ENOSPC: reason = "Current HW/SW setup lacks sufficient resources.\n"; break; } netdev_err(dev, "Failed to %s %s (object id=%d) with error: %pe (%d).\n%s%s\n", action, obj_str, obj_id, ERR_PTR(err), err, problem, reason); } static void switchdev_port_obj_add_deferred(struct net_device *dev, const void *data) { const struct switchdev_obj *obj = data; int err; ASSERT_RTNL(); err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, dev, obj, NULL); if (err && err != -EOPNOTSUPP) switchdev_obj_id_to_helpful_msg(dev, obj->id, err, true); if (obj->complete) obj->complete(dev, err, obj->complete_priv); } static int switchdev_port_obj_add_defer(struct net_device *dev, const struct switchdev_obj *obj) { return switchdev_deferred_enqueue(dev, obj, switchdev_obj_size(obj), switchdev_port_obj_add_deferred); } /** * switchdev_port_obj_add - Add port object * * @dev: port device * @obj: object to add * @extack: netlink extended ack * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. */ int switchdev_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { if (obj->flags & SWITCHDEV_F_DEFER) return switchdev_port_obj_add_defer(dev, obj); ASSERT_RTNL(); return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, dev, obj, extack); } EXPORT_SYMBOL_GPL(switchdev_port_obj_add); static int switchdev_port_obj_del_now(struct net_device *dev, const struct switchdev_obj *obj) { return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_DEL, dev, obj, NULL); } static void switchdev_port_obj_del_deferred(struct net_device *dev, const void *data) { const struct switchdev_obj *obj = data; int err; err = switchdev_port_obj_del_now(dev, obj); if (err && err != -EOPNOTSUPP) switchdev_obj_id_to_helpful_msg(dev, obj->id, err, false); if (obj->complete) obj->complete(dev, err, obj->complete_priv); } static int switchdev_port_obj_del_defer(struct net_device *dev, const struct switchdev_obj *obj) { return switchdev_deferred_enqueue(dev, obj, switchdev_obj_size(obj), switchdev_port_obj_del_deferred); } /** * switchdev_port_obj_del - Delete port object * * @dev: port device * @obj: object to delete * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. */ int switchdev_port_obj_del(struct net_device *dev, const struct switchdev_obj *obj) { if (obj->flags & SWITCHDEV_F_DEFER) return switchdev_port_obj_del_defer(dev, obj); ASSERT_RTNL(); return switchdev_port_obj_del_now(dev, obj); } EXPORT_SYMBOL_GPL(switchdev_port_obj_del); /** * switchdev_port_obj_act_is_deferred - Is object action pending? * * @dev: port device * @nt: type of action; add or delete * @obj: object to test * * Returns true if a deferred item is pending, which is * equivalent to the action @nt on an object @obj. * * rtnl_lock must be held. */ bool switchdev_port_obj_act_is_deferred(struct net_device *dev, enum switchdev_notifier_type nt, const struct switchdev_obj *obj) { struct switchdev_deferred_item *dfitem; bool found = false; ASSERT_RTNL(); spin_lock_bh(&deferred_lock); list_for_each_entry(dfitem, &deferred, list) { if (dfitem->dev != dev) continue; if ((dfitem->func == switchdev_port_obj_add_deferred && nt == SWITCHDEV_PORT_OBJ_ADD) || (dfitem->func == switchdev_port_obj_del_deferred && nt == SWITCHDEV_PORT_OBJ_DEL)) { if (switchdev_obj_eq((const void *)dfitem->data, obj)) { found = true; break; } } } spin_unlock_bh(&deferred_lock); return found; } EXPORT_SYMBOL_GPL(switchdev_port_obj_act_is_deferred); static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain); static RAW_NOTIFIER_HEAD(switchdev_blocking_notif_chain); /** * register_switchdev_notifier - Register notifier * @nb: notifier_block * * Register switch device notifier. */ int register_switchdev_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&switchdev_notif_chain, nb); } EXPORT_SYMBOL_GPL(register_switchdev_notifier); /** * unregister_switchdev_notifier - Unregister notifier * @nb: notifier_block * * Unregister switch device notifier. */ int unregister_switchdev_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&switchdev_notif_chain, nb); } EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); /** * call_switchdev_notifiers - Call notifiers * @val: value passed unmodified to notifier function * @dev: port device * @info: notifier information data * @extack: netlink extended ack * Call all network notifier blocks. */ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info, struct netlink_ext_ack *extack) { info->dev = dev; info->extack = extack; return atomic_notifier_call_chain(&switchdev_notif_chain, val, info); } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); int register_switchdev_blocking_notifier(struct notifier_block *nb) { struct raw_notifier_head *chain = &switchdev_blocking_notif_chain; int err; rtnl_lock(); err = raw_notifier_chain_register(chain, nb); rtnl_unlock(); return err; } EXPORT_SYMBOL_GPL(register_switchdev_blocking_notifier); int unregister_switchdev_blocking_notifier(struct notifier_block *nb) { struct raw_notifier_head *chain = &switchdev_blocking_notif_chain; int err; rtnl_lock(); err = raw_notifier_chain_unregister(chain, nb); rtnl_unlock(); return err; } EXPORT_SYMBOL_GPL(unregister_switchdev_blocking_notifier); int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info, struct netlink_ext_ack *extack) { ASSERT_RTNL(); info->dev = dev; info->extack = extack; return raw_notifier_call_chain(&switchdev_blocking_notif_chain, val, info); } EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers); struct switchdev_nested_priv { bool (*check_cb)(const struct net_device *dev); bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev); const struct net_device *dev; struct net_device *lower_dev; }; static int switchdev_lower_dev_walk(struct net_device *lower_dev, struct netdev_nested_priv *priv) { struct switchdev_nested_priv *switchdev_priv = priv->data; bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev); bool (*check_cb)(const struct net_device *dev); const struct net_device *dev; check_cb = switchdev_priv->check_cb; foreign_dev_check_cb = switchdev_priv->foreign_dev_check_cb; dev = switchdev_priv->dev; if (check_cb(lower_dev) && !foreign_dev_check_cb(lower_dev, dev)) { switchdev_priv->lower_dev = lower_dev; return 1; } return 0; } static struct net_device * switchdev_lower_dev_find_rcu(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev)) { struct switchdev_nested_priv switchdev_priv = { .check_cb = check_cb, .foreign_dev_check_cb = foreign_dev_check_cb, .dev = dev, .lower_dev = NULL, }; struct netdev_nested_priv priv = { .data = &switchdev_priv, }; netdev_walk_all_lower_dev_rcu(dev, switchdev_lower_dev_walk, &priv); return switchdev_priv.lower_dev; } static struct net_device * switchdev_lower_dev_find(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev)) { struct switchdev_nested_priv switchdev_priv = { .check_cb = check_cb, .foreign_dev_check_cb = foreign_dev_check_cb, .dev = dev, .lower_dev = NULL, }; struct netdev_nested_priv priv = { .data = &switchdev_priv, }; netdev_walk_all_lower_dev(dev, switchdev_lower_dev_walk, &priv); return switchdev_priv.lower_dev; } static int __switchdev_handle_fdb_event_to_device(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const struct switchdev_notifier_fdb_info *fdb_info, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev), int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info)) { const struct switchdev_notifier_info *info = &fdb_info->info; struct net_device *br, *lower_dev, *switchdev; struct list_head *iter; int err = -EOPNOTSUPP; if (check_cb(dev)) return mod_cb(dev, orig_dev, event, info->ctx, fdb_info); /* Recurse through lower interfaces in case the FDB entry is pointing * towards a bridge or a LAG device. */ netdev_for_each_lower_dev(dev, lower_dev, iter) { /* Do not propagate FDB entries across bridges */ if (netif_is_bridge_master(lower_dev)) continue; /* Bridge ports might be either us, or LAG interfaces * that we offload. */ if (!check_cb(lower_dev) && !switchdev_lower_dev_find_rcu(lower_dev, check_cb, foreign_dev_check_cb)) continue; err = __switchdev_handle_fdb_event_to_device(lower_dev, orig_dev, event, fdb_info, check_cb, foreign_dev_check_cb, mod_cb); if (err && err != -EOPNOTSUPP) return err; } /* Event is neither on a bridge nor a LAG. Check whether it is on an * interface that is in a bridge with us. */ br = netdev_master_upper_dev_get_rcu(dev); if (!br || !netif_is_bridge_master(br)) return 0; switchdev = switchdev_lower_dev_find_rcu(br, check_cb, foreign_dev_check_cb); if (!switchdev) return 0; if (!foreign_dev_check_cb(switchdev, dev)) return err; return __switchdev_handle_fdb_event_to_device(br, orig_dev, event, fdb_info, check_cb, foreign_dev_check_cb, mod_cb); } int switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long event, const struct switchdev_notifier_fdb_info *fdb_info, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev), int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info)) { int err; err = __switchdev_handle_fdb_event_to_device(dev, dev, event, fdb_info, check_cb, foreign_dev_check_cb, mod_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_fdb_event_to_device); static int __switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device